DataCamp offers interactive courses related to R programming. While some of the material is review, it is helpful to see other perspectives on it. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
This document is currently split between _v003 and _v003_a due to the need to keep the number of DLLs that it opens below the hard-coded maximum. This introductory section needs to be re-written, and the contents consolidated, at a future date.
The original DataCamp_Insights_v001 and DataCamp_Insights_v002 documents have been split to produce this document:
Chapter 1 - Dates and Times in R
Introduction to dates - including the built-in methods for R:
Why use dates?
What about times?
Why lubridate?
Example code includes:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# The date R 3.0.0 was released
x <- "2013-04-03"
# Examine structure of x
str(x)
## chr "2013-04-03"
# Use as.Date() to interpret x as a date
x_date <- as.Date(x)
# Examine structure of x_date
str(x_date)
## Date[1:1], format: "2013-04-03"
# Store April 10 2014 as a Date
april_10_2014 <- as.Date("2014-04-10")
# Load the readr package
library(readr)
# Use read_csv() to import rversions.csv
releases <- read_csv("./RInputFiles/rversions.csv")
## Parsed with column specification:
## cols(
## major = col_integer(),
## minor = col_integer(),
## patch = col_integer(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
# Examine the structure of the date column
str(releases$date)
## Date[1:105], format: "1997-12-04" "1997-12-21" "1998-01-10" "1998-03-14" ...
# Load the anytime package
library(anytime)
# Various ways of writing Sep 10 2009
sep_10_2009 <- c("September 10 2009", "2009-09-10", "10 Sep 2009", "09-10-2009")
# Use anytime() to parse sep_10_2009
anytime(sep_10_2009)
## [1] "2009-09-10 CDT" "2009-09-10 CDT" "2009-09-10 CDT" "2009-09-10 CDT"
# Set the x axis to the date column
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major)))
# Limit the axis to between 2010-01-01 and 2014-01-01
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major))) +
xlim(as.Date("2010-01-01"), as.Date("2014-01-01"))
## Warning: Removed 87 rows containing missing values (geom_path).
# Specify breaks every ten years and labels with "%Y"
ggplot(releases, aes(x = date, y = type)) +
geom_line(aes(group = 1, color = factor(major))) +
scale_x_date(date_breaks = "10 years", date_labels = "%Y")
# Find the largest date
last_release_date <- max(releases$date)
# Filter row for last release
last_release <- filter(releases, date == last_release_date)
# Print last_release
last_release
## # A tibble: 1 x 7
## major minor patch date datetime time type
## <int> <int> <int> <date> <dttm> <time> <chr>
## 1 3 4 1 2017-06-30 2017-06-30 07:04:11 07:04 patch
# How long since last release?
Sys.Date() - last_release_date
## Time difference of 258 days
# Use as.POSIXct to enter the datetime
as.POSIXct("2010-10-01 12:12:00")
## [1] "2010-10-01 12:12:00 CDT"
# Use as.POSIXct again but set the timezone to `"America/Los_Angeles"`
as.POSIXct("2010-10-01 12:12:00", tz = "America/Los_Angeles")
## [1] "2010-10-01 12:12:00 PDT"
# Use read_csv to import rversions.csv
releases <- read_csv("./RInputFiles/rversions.csv")
## Parsed with column specification:
## cols(
## major = col_integer(),
## minor = col_integer(),
## patch = col_integer(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
# Examine structure of datetime column
str(releases$datetime)
## POSIXct[1:105], format: "1997-12-04 08:47:58" "1997-12-21 13:09:22" ...
# Import "cran-logs_2015-04-17.csv" with read_csv()
logs <- read_csv("./RInputFiles/cran-logs_2015-04-17.csv")
## Parsed with column specification:
## cols(
## datetime = col_datetime(format = ""),
## r_version = col_character(),
## country = col_character()
## )
# Print logs
logs
## # A tibble: 100,000 x 3
## datetime r_version country
## <dttm> <chr> <chr>
## 1 2015-04-16 22:40:19 3.1.3 CO
## 2 2015-04-16 09:11:04 3.1.3 GB
## 3 2015-04-16 17:12:37 3.1.3 DE
## 4 2015-04-18 12:34:43 3.2.0 GB
## 5 2015-04-16 04:49:18 3.1.3 PE
## 6 2015-04-16 06:40:44 3.1.3 TW
## 7 2015-04-16 00:21:36 3.1.3 US
## 8 2015-04-16 10:27:23 3.1.3 US
## 9 2015-04-16 01:59:43 3.1.3 SG
## 10 2015-04-18 15:41:32 3.2.0 CA
## # ... with 99,990 more rows
# Store the release time as a POSIXct object
release_time <- as.POSIXct("2015-04-16 07:13:33", tz = "UTC")
# When is the first download of 3.2.0?
logs %>%
filter(r_version == "3.2.0")
## # A tibble: 35,928 x 3
## datetime r_version country
## <dttm> <chr> <chr>
## 1 2015-04-18 12:34:43 3.2.0 GB
## 2 2015-04-18 15:41:32 3.2.0 CA
## 3 2015-04-18 14:58:41 3.2.0 IE
## 4 2015-04-18 16:44:45 3.2.0 US
## 5 2015-04-18 04:34:35 3.2.0 US
## 6 2015-04-18 22:29:45 3.2.0 CH
## 7 2015-04-17 16:21:06 3.2.0 US
## 8 2015-04-18 20:34:57 3.2.0 AT
## 9 2015-04-17 18:23:19 3.2.0 US
## 10 2015-04-18 03:00:31 3.2.0 US
## # ... with 35,918 more rows
# Examine histograms of downloads by version
ggplot(logs, aes(x = datetime)) +
geom_histogram() +
geom_vline(aes(xintercept = as.numeric(release_time)))+
facet_wrap(~ r_version, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Chapter 2 - Parsing and Manipulating Dates with lubridate
Parsing dates with lubridate:
Weather in Auckland (data from Weather Underground, METAR from Auckland airport):
Extracting parts of a datetime:
Rounding datetimes:
Example code includes:
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(readr)
library(dplyr)
library(ggplot2)
library(ggridges)
library(stringr)
# Parse x
x <- "2010 September 20th" # 2010-09-20
ymd(x)
## [1] "2010-09-20"
# Parse y
y <- "02.01.2010" # 2010-01-02
dmy(y)
## [1] "2010-01-02"
# Parse z
z <- "Sep, 12th 2010 14:00" # 2010-09-12T14:00
mdy_hm(z)
## [1] "2010-09-12 14:00:00 UTC"
# Specify an order string to parse x
x <- "Monday June 1st 2010 at 4pm"
parse_date_time(x, orders = "AmdyIp")
## [1] "2010-06-01 16:00:00 UTC"
# Specify order to include both "mdy" and "dmy"
two_orders <- c("October 7, 2001", "October 13, 2002", "April 13, 2003",
"17 April 2005", "23 April 2017")
parse_date_time(two_orders, orders = c("mdy", "dmy"))
## [1] "2001-10-07 UTC" "2002-10-13 UTC" "2003-04-13 UTC" "2005-04-17 UTC"
## [5] "2017-04-23 UTC"
# Specify order to include "dOmY", "OmY" and "Y"
short_dates <- c("11 December 1282", "May 1372", "1253")
parse_date_time(short_dates, orders = c("dOmY", "OmY", "Y"))
## [1] "1282-12-11 UTC" "1372-05-01 UTC" "1253-01-01 UTC"
# Import CSV with read_csv()
akl_daily_raw <- read_csv("./RInputFiles/akl_weather_daily.csv")
## Parsed with column specification:
## cols(
## date = col_character(),
## max_temp = col_integer(),
## min_temp = col_integer(),
## mean_temp = col_integer(),
## mean_rh = col_integer(),
## events = col_character(),
## cloud_cover = col_integer()
## )
# Print akl_daily_raw
akl_daily_raw
## # A tibble: 3,661 x 7
## date max_temp min_temp mean_temp mean_rh events cloud_cover
## <chr> <int> <int> <int> <int> <chr> <int>
## 1 2007-9-1 60 51 56 75 <NA> 4
## 2 2007-9-2 60 53 56 82 Rain 4
## 3 2007-9-3 57 51 54 78 <NA> 6
## 4 2007-9-4 64 50 57 80 Rain 6
## 5 2007-9-5 53 48 50 90 Rain 7
## 6 2007-9-6 57 42 50 69 <NA> 1
## 7 2007-9-7 59 41 50 77 <NA> 4
## 8 2007-9-8 59 46 52 80 <NA> 5
## 9 2007-9-9 55 50 52 88 Rain 7
## 10 2007-9-10 59 50 54 82 Rain 4
## # ... with 3,651 more rows
# Parse date
akl_daily <- akl_daily_raw %>%
mutate(date = ymd(date))
# Print akl_daily
akl_daily
## # A tibble: 3,661 x 7
## date max_temp min_temp mean_temp mean_rh events cloud_cover
## <date> <int> <int> <int> <int> <chr> <int>
## 1 2007-09-01 60 51 56 75 <NA> 4
## 2 2007-09-02 60 53 56 82 Rain 4
## 3 2007-09-03 57 51 54 78 <NA> 6
## 4 2007-09-04 64 50 57 80 Rain 6
## 5 2007-09-05 53 48 50 90 Rain 7
## 6 2007-09-06 57 42 50 69 <NA> 1
## 7 2007-09-07 59 41 50 77 <NA> 4
## 8 2007-09-08 59 46 52 80 <NA> 5
## 9 2007-09-09 55 50 52 88 Rain 7
## 10 2007-09-10 59 50 54 82 Rain 4
## # ... with 3,651 more rows
# Plot to check work
ggplot(akl_daily, aes(x = date, y = max_temp)) +
geom_line()
## Warning: Removed 1 rows containing missing values (geom_path).
# Import "akl_weather_hourly_2016.csv"
akl_hourly_raw <- read_csv("./RInputFiles/akl_weather_hourly_2016.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## month = col_integer(),
## mday = col_integer(),
## time = col_time(format = ""),
## temperature = col_double(),
## weather = col_character(),
## conditions = col_character(),
## events = col_character(),
## humidity = col_integer(),
## date_utc = col_datetime(format = "")
## )
# Print akl_hourly_raw
akl_hourly_raw
## # A tibble: 17,454 x 10
## year month mday time temperature weather conditions events humidity
## <int> <int> <int> <time> <dbl> <chr> <chr> <chr> <int>
## 1 2016 1 1 00:00 68.0 Clear Clear <NA> 68
## 2 2016 1 1 00:30 68.0 Clear Clear <NA> 68
## 3 2016 1 1 01:00 68.0 Clear Clear <NA> 73
## 4 2016 1 1 01:30 68.0 Clear Clear <NA> 68
## 5 2016 1 1 02:00 68.0 Clear Clear <NA> 68
## 6 2016 1 1 02:30 68.0 Clear Clear <NA> 68
## 7 2016 1 1 03:00 68.0 Clear Clear <NA> 68
## 8 2016 1 1 03:30 68.0 Cloudy Partly Cl~ <NA> 68
## 9 2016 1 1 04:00 68.0 Cloudy Scattered~ <NA> 68
## 10 2016 1 1 04:30 66.2 Cloudy Partly Cl~ <NA> 73
## # ... with 17,444 more rows, and 1 more variable: date_utc <dttm>
# Use make_date() to combine year, month and mday
akl_hourly <- akl_hourly_raw %>%
mutate(date = make_date(year = year, month = month, day = mday))
# Parse datetime_string
akl_hourly <- akl_hourly %>%
mutate(
datetime_string = paste(date, time, sep = "T"),
datetime = ymd_hms(datetime_string)
)
# Print date, time and datetime columns of akl_hourly
akl_hourly %>% select(date, time, datetime)
## # A tibble: 17,454 x 3
## date time datetime
## <date> <time> <dttm>
## 1 2016-01-01 00:00 2016-01-01 00:00:00
## 2 2016-01-01 00:30 2016-01-01 00:30:00
## 3 2016-01-01 01:00 2016-01-01 01:00:00
## 4 2016-01-01 01:30 2016-01-01 01:30:00
## 5 2016-01-01 02:00 2016-01-01 02:00:00
## 6 2016-01-01 02:30 2016-01-01 02:30:00
## 7 2016-01-01 03:00 2016-01-01 03:00:00
## 8 2016-01-01 03:30 2016-01-01 03:30:00
## 9 2016-01-01 04:00 2016-01-01 04:00:00
## 10 2016-01-01 04:30 2016-01-01 04:30:00
## # ... with 17,444 more rows
# Plot to check work
ggplot(akl_hourly, aes(x = datetime, y = temperature)) +
geom_line()
# Examine the head() of release_time
releases <- read_csv("./RInputFiles/rversions.csv")
## Parsed with column specification:
## cols(
## major = col_integer(),
## minor = col_integer(),
## patch = col_integer(),
## date = col_date(format = ""),
## datetime = col_datetime(format = ""),
## time = col_time(format = ""),
## type = col_character()
## )
release_time <- releases %>% pull(datetime)
head(release_time)
## [1] "1997-12-04 08:47:58 UTC" "1997-12-21 13:09:22 UTC"
## [3] "1998-01-10 00:31:55 UTC" "1998-03-14 19:25:55 UTC"
## [5] "1998-05-02 07:58:17 UTC" "1998-06-14 12:56:20 UTC"
# Examine the head() of the months of release_time
head(month(release_time))
## [1] 12 12 1 3 5 6
# Extract the month of releases
month(release_time) %>% table()
## .
## 1 2 3 4 5 6 7 8 9 10 11 12
## 5 6 8 18 5 16 4 7 2 15 6 13
# Extract the year of releases
year(release_time) %>% table()
## .
## 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011
## 2 10 9 6 6 5 5 4 4 4 4 6 5 4 6
## 2012 2013 2014 2015 2016 2017
## 4 4 4 5 5 3
# How often is the hour before 12 (noon)?
mean(hour(release_time) < 12)
## [1] 0.752381
# How often is the release in am?
mean(am(release_time))
## [1] 0.752381
# Use wday() to tabulate release by day of the week
wday(releases$datetime) %>% table()
## .
## 1 2 3 4 5 6 7
## 3 29 9 12 18 31 3
# Add label = TRUE to make table more readable
wday(releases$datetime, label=TRUE) %>% table()
## .
## Sun Mon Tue Wed Thu Fri Sat
## 3 29 9 12 18 31 3
# Create column wday to hold labelled week days
releases$wday <- wday(releases$datetime, label=TRUE)
# Plot barchart of weekday by type of release
ggplot(releases, aes(x=wday)) +
geom_bar() +
facet_wrap(~ type, ncol = 1, scale = "free_y")
# Add columns for year, yday and month
akl_daily <- akl_daily %>%
mutate(
year = year(date),
yday = yday(date),
month = month(date, label=TRUE))
# Plot max_temp by yday for all years
ggplot(akl_daily, aes(x = yday, y = max_temp)) +
geom_line(aes(group = year), alpha = 0.5)
## Warning: Removed 1 rows containing missing values (geom_path).
# Examine distribution of max_temp by month (one density ridge per month)
ggplot(akl_daily, aes(x = max_temp, y = month, height = ..density..)) +
geom_density_ridges(stat = "density")
## Warning: Removed 10 rows containing non-finite values (stat_density).
# Create new columns hour, month and rainy
akl_hourly <- akl_hourly %>%
mutate(
hour = hour(datetime),
month = month(datetime, label=TRUE),
rainy = (weather == "Precipitation")
)
# Filter for hours between 8am and 10pm (inclusive)
akl_day <- akl_hourly %>%
filter(hour >= 8, hour <= 22)
# Summarise for each date if there is any rain
rainy_days <- akl_day %>%
group_by(month, date) %>%
summarise(
any_rain = any(rainy)
)
# Summarise for each month, the number of days with rain
rainy_days %>%
summarise(
days_rainy = sum(any_rain)
)
## # A tibble: 12 x 2
## month days_rainy
## <ord> <int>
## 1 Jan 15
## 2 Feb 13
## 3 Mar 12
## 4 Apr 15
## 5 May 21
## 6 Jun 19
## 7 Jul 22
## 8 Aug 16
## 9 Sep 25
## 10 Oct 20
## 11 Nov 19
## 12 Dec 11
# NOTE(review): 2016-05-03 07:13:28 is the R 3.3.0 release moment, not 3.4.1
# (the releases table earlier in this document shows 3.4.1 released
# 2017-06-30 07:04:11). Variable name kept as-is to match the course
# exercise; confirm before reusing this value elsewhere.
r_3_4_1 <- ymd_hms("2016-05-03 07:13:28 UTC")
# Round down to day
floor_date(r_3_4_1, unit = "day")
## [1] "2016-05-03 UTC"
# Round to nearest 5 minutes
round_date(r_3_4_1, unit = "5 minutes")
## [1] "2016-05-03 07:15:00 UTC"
# Round up to week
ceiling_date(r_3_4_1, unit = "week")
## [1] "2016-05-08 UTC"
# Subtract r_3_4_1 rounded down to day
r_3_4_1 - floor_date(r_3_4_1, unit = "day")
## Time difference of 7.224444 hours
# Create day_hour, datetime rounded down to hour
akl_hourly <- akl_hourly %>%
mutate(
day_hour = floor_date(datetime, unit = "hour")
)
# Count observations per hour
akl_hourly %>%
count(day_hour)
## # A tibble: 8,770 x 2
## day_hour n
## <dttm> <int>
## 1 2016-01-01 00:00:00 2
## 2 2016-01-01 01:00:00 2
## 3 2016-01-01 02:00:00 2
## 4 2016-01-01 03:00:00 2
## 5 2016-01-01 04:00:00 2
## 6 2016-01-01 05:00:00 2
## 7 2016-01-01 06:00:00 2
## 8 2016-01-01 07:00:00 2
## 9 2016-01-01 08:00:00 2
## 10 2016-01-01 09:00:00 2
## # ... with 8,760 more rows
# Find day_hours with n != 2
akl_hourly %>%
count(day_hour) %>%
filter(n != 2) %>%
arrange(desc(n))
## # A tibble: 92 x 2
## day_hour n
## <dttm> <int>
## 1 2016-04-03 02:00:00 4
## 2 2016-09-25 00:00:00 4
## 3 2016-06-26 09:00:00 1
## 4 2016-09-01 23:00:00 1
## 5 2016-09-02 01:00:00 1
## 6 2016-09-04 11:00:00 1
## 7 2016-09-04 16:00:00 1
## 8 2016-09-04 17:00:00 1
## 9 2016-09-05 00:00:00 1
## 10 2016-09-05 15:00:00 1
## # ... with 82 more rows
Chapter 3 - Arithmetic with Dates and Times
Taking differences of datetimes:
Time spans - difficult because they do not have a constant meaning (e.g., the impact of daylight saving time):
Intervals - third option in lubridate for storing times:
Example code includes:
# The date of landing and moment of step
date_landing <- mdy("July 20, 1969")
moment_step <- mdy_hms("July 20, 1969, 02:56:15", tz = "UTC")
# How many days since the first man on the moon?
difftime(today(), date_landing, units = "days")
## Time difference of 17770 days
# How many seconds since the first man on the moon?
difftime(now(), moment_step, units = "secs")
## Time difference of 1535362760 secs
# Three dates
mar_11 <- ymd_hms("2017-03-11 12:00:00",
tz = "America/Los_Angeles")
mar_12 <- ymd_hms("2017-03-12 12:00:00",
tz = "America/Los_Angeles")
mar_13 <- ymd_hms("2017-03-13 12:00:00",
tz = "America/Los_Angeles")
# Difference between mar_13 and mar_12 in seconds
difftime(mar_13, mar_12, units = "secs")
## Time difference of 86400 secs
# Difference between mar_12 and mar_11 in seconds
difftime(mar_12, mar_11, units = "secs")
## Time difference of 82800 secs
# Add a period of one week to mon_2pm
mon_2pm <- dmy_hm("27 Aug 2018 14:00")
mon_2pm + weeks(1)
## [1] "2018-09-03 14:00:00 UTC"
# Add a duration of 81 hours to tue_9am
tue_9am <- dmy_hm("28 Aug 2018 9:00")
tue_9am + dhours(81)
## [1] "2018-08-31 18:00:00 UTC"
# Subtract a period of five years from today()
today() - years(5)
## [1] "2013-03-15"
# Subtract a duration of five years from today()
today() - dyears(5)
## [1] "2013-03-16"
# Time of North American Eclipse 2017
eclipse_2017 <- ymd_hms("2017-08-21 18:26:40")
# Duration of 29 days, 12 hours, 44 mins and 3 secs
synodic <- ddays(29) + dhours(12) + dminutes(44) + dseconds(3)
# 223 synodic months
saros <- 223 * synodic
# Add saros to eclipse_2017
eclipse_2017 + saros
## [1] "2035-09-02 02:09:49 UTC"
# Add a period of 8 hours to today
today_8am <- today() + hours(8)
# Sequence of two weeks from 1 to 26
every_two_weeks <- 1:26 * weeks(2)
# Create datetime for every two weeks for a year
today_8am + every_two_weeks
## [1] "2018-03-29 08:00:00 UTC" "2018-04-12 08:00:00 UTC"
## [3] "2018-04-26 08:00:00 UTC" "2018-05-10 08:00:00 UTC"
## [5] "2018-05-24 08:00:00 UTC" "2018-06-07 08:00:00 UTC"
## [7] "2018-06-21 08:00:00 UTC" "2018-07-05 08:00:00 UTC"
## [9] "2018-07-19 08:00:00 UTC" "2018-08-02 08:00:00 UTC"
## [11] "2018-08-16 08:00:00 UTC" "2018-08-30 08:00:00 UTC"
## [13] "2018-09-13 08:00:00 UTC" "2018-09-27 08:00:00 UTC"
## [15] "2018-10-11 08:00:00 UTC" "2018-10-25 08:00:00 UTC"
## [17] "2018-11-08 08:00:00 UTC" "2018-11-22 08:00:00 UTC"
## [19] "2018-12-06 08:00:00 UTC" "2018-12-20 08:00:00 UTC"
## [21] "2019-01-03 08:00:00 UTC" "2019-01-17 08:00:00 UTC"
## [23] "2019-01-31 08:00:00 UTC" "2019-02-14 08:00:00 UTC"
## [25] "2019-02-28 08:00:00 UTC" "2019-03-14 08:00:00 UTC"
jan_31 <- ymd("2018-01-31")
# A sequence of 1 to 12 periods of 1 month
month_seq <- 1:12 * months(1)
# Add 1 to 12 months to jan_31
jan_31 + month_seq
## [1] NA "2018-03-31" NA "2018-05-31" NA
## [6] "2018-07-31" "2018-08-31" NA "2018-10-31" NA
## [11] "2018-12-31" "2019-01-31"
# Replace + with %m+%
jan_31 %m+% month_seq
## [1] "2018-02-28" "2018-03-31" "2018-04-30" "2018-05-31" "2018-06-30"
## [6] "2018-07-31" "2018-08-31" "2018-09-30" "2018-10-31" "2018-11-30"
## [11] "2018-12-31" "2019-01-31"
# Replace + with %m-%
jan_31 %m-% month_seq
## [1] "2017-12-31" "2017-11-30" "2017-10-31" "2017-09-30" "2017-08-31"
## [6] "2017-07-31" "2017-06-30" "2017-05-31" "2017-04-30" "2017-03-31"
## [11] "2017-02-28" "2017-01-31"
# Create monarchs
mNames <- c('Elizabeth II' ,'Victoria' ,'George V' ,'George III' ,'George VI' ,'George IV' ,'Edward VII' ,'William IV' ,'Edward VIII' ,'George III(also United Kingdom)' ,'George II' ,'George I' ,'Anne' ,'Henry III' ,'Edward III' ,'Elizabeth I' ,'Henry VI' ,'Henry VI' ,'Æthelred II' ,'Æthelred II' ,'Henry VIII' ,'Charles II' ,'Henry I' ,'Henry II(co-ruler with Henry the Young King)' ,'Edward I' ,'Alfred the Great' ,'Edward the Elder' ,'Charles I' ,'Henry VII' ,'Edward the Confessor' ,'Richard II' ,'James I' ,'Edward IV' ,'Edward IV' ,'William I' ,'Edward II' ,'Cnut' ,'Stephen' ,'Stephen' ,'John' ,'Edgar I' ,'Æthelstan' ,'Henry IV' ,'William III(co-ruler with Mary II)' ,'Henry the Young King(co-ruler with Henry II)' ,'William II' ,'Richard I' ,'Eadred' ,'Henry V' ,'Edmund I' ,'Edward VI' ,'Mary II(co-ruler with William III)' ,'Mary I' ,'Anne(also Kingdom of Great Britain)' ,'Eadwig' ,'James II' ,'Edward the Martyr' ,'Harold I' ,'Harthacnut' ,'Richard III' ,'Louis (disputed)' ,'Harold II' ,'Edmund II' ,'Matilda (disputed)' ,'Edward V' ,'Edgar II' ,'Sweyn Forkbeard' ,'Jane (disputed)' ,'James VI' ,'William I' ,'Constantine II' ,'David II' ,'Alexander III' ,'Malcolm III' ,'Alexander II' ,'James I' ,'Malcolm II' ,'James V' ,'David I' ,'James III' ,'Charles II' ,'Charles II' ,'James IV' ,'Mary I' ,'Charles I' ,'Kenneth II' ,'James II' ,'Robert I' ,'Robert II' ,'Alexander I' ,'Macbeth' ,'Robert III' ,'Constantine I' ,'Kenneth MacAlpin' ,'William II' ,'Malcolm IV' ,'Giric(co-ruler with Eochaid?)' ,'Donald II' ,'Malcolm I' ,'Edgar' ,'Kenneth III' ,'Indulf' ,'Duncan I' ,'Mary II' ,'Amlaíb' ,'Anne(also Kingdom of Great Britain)' ,'Dub' ,'Cuilén' ,'Domnall mac Ailpín' ,'James VII' ,'Margaret' ,'John Balliol' ,'Donald III' ,'Constantine III' ,'Áed mac Cináeda' ,'Lulach' ,'Duncan II' ,'Ruaidrí Ua Conchobair' ,'Edward Bruce (disputed)' ,'Brian Ua Néill (disputed)' ,'Gruffudd ap Cynan' ,'Llywelyn the Great' ,'Owain Gwynedd' ,'Dafydd ab Owain Gwynedd' ,'Hywel ab Owain Gwynedd' 
,'Llywelyn ap Gruffudd' ,'Owain Glyndŵr (disputed)' ,'Owain Goch ap Gruffydd' ,'Owain Lawgoch (disputed)' ,'Dafydd ap Llywelyn' ,'Dafydd ap Gruffydd')
mDominion <- c('United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'United Kingdom' ,'Great Britain' ,'Great Britain' ,'Great Britain' ,'Great Britain' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'England' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Scotland' ,'Ireland' ,'Ireland' ,'Ireland' ,'Gwynedd' ,'Gwynedd' ,'Gwynedd' ,'Gwynedd' ,'Gwynedd' ,'Wales' ,'Wales' ,'Wales' ,'Wales' ,'Wales' ,'Wales')
mFrom <- c('1952-02-06' ,'1837-06-20' ,'1910-05-06' ,'1801-01-01' ,'1936-12-11' ,'1820-01-29' ,'1901-01-22' ,'1830-06-26' ,'1936-01-20' ,'1760-10-25' ,'1727-06-22' ,'1714-08-01' ,'1707-05-01' ,'NA' ,'1327-01-25' ,'1558-11-17' ,'1422-08-31' ,'1470-10-31' ,'978-03-18' ,'1014-02-03' ,'1509-04-22' ,'1649-01-30' ,'1100-08-03' ,'1154-10-25' ,'1272-11-20' ,'871-04-24' ,'899-10-27' ,'1625-03-27' ,'1485-08-22' ,'1042-06-08' ,'1377-06-22' ,'1603-03-24' ,'1461-03-04' ,'1471-04-11' ,'1066-12-12' ,'1307-07-07' ,'1016-11-30' ,'1135-12-22' ,'1141-11-01' ,'1199-04-06' ,'959-10-01' ,'924-08-02' ,'1399-09-29' ,'1689-02-13' ,'1170-06-14' ,'1087-09-09' ,'1189-07-06' ,'946-05-26' ,'1413-03-21' ,'939-10-27' ,'1547-01-28' ,'1689-02-13' ,'1553-07-19' ,'1702-03-08' ,'955-11-23' ,'1685-02-06' ,'975-07-09' ,'1037-11-12' ,'1040-03-17' ,'1483-06-26' ,'1216-06-14' ,'1066-01-05' ,'1016-04-23' ,'1141-04-07' ,'1483-04-09' ,'1066-10-15' ,'1013-12-25' ,'1553-07-10' ,'1567-07-24' ,'1165-12-09' ,'900-01-01' ,'1329-06-07' ,'1249-07-06' ,'1058-03-17' ,'1214-12-04' ,'1406-04-04' ,'1005-03-25' ,'1513-09-09' ,'1124-04-23' ,'1460-08-03' ,'1649-01-30' ,'1660-05-29' ,'1488-06-11' ,'1542-12-14' ,'1625-03-27' ,'971-01-01' ,'1437-02-21' ,'1306-03-25' ,'1371-02-22' ,'1107-01-08' ,'1040-08-14' ,'1390-04-19' ,'862-01-01' ,'843-01-01' ,'1689-05-11' ,'1153-05-24' ,'878-01-01' ,'889-01-01' ,'943-01-01' ,'1097-01-01' ,'997-01-01' ,'954-01-01' ,'1034-11-25' ,'1689-04-11' ,'971-01-01' ,'1702-03-08' ,'962-01-01' ,'NA' ,'858-01-01' ,'1685-02-06' ,'1286-11-25' ,'1292-11-17' ,'1093-11-13' ,'1095-01-01' ,'877-01-01' ,'1057-08-15' ,'1094-05-01' ,'1166-01-01' ,'1315-06-01' ,'1258-01-01' ,'1081-01-01' ,'1195-01-01' ,'1137-01-01' ,'1170-01-01' ,'1170-01-01' ,'1253-01-01' ,'1400-09-16' ,'1246-02-25' ,'1372-05-01' ,'1240-04-12' ,'1282-12-11')
mTo <- c('2018-02-08' ,'1901-01-22' ,'1936-01-20' ,'1820-01-29' ,'1952-02-06' ,'1830-06-26' ,'1910-05-06' ,'1837-06-20' ,'1936-12-11' ,'1801-01-01' ,'1760-10-25' ,'1727-06-11' ,'1714-08-01' ,'1272-11-16' ,'1377-06-21' ,'1603-03-24' ,'1461-03-04' ,'1471-04-11' ,'1013-12-25' ,'1016-04-23' ,'1547-01-28' ,'1685-02-06' ,'1135-12-01' ,'1189-07-06' ,'1307-07-07' ,'899-10-26' ,'924-07-17' ,'1649-01-30' ,'1509-04-21' ,'1066-01-05' ,'1399-09-29' ,'1625-03-27' ,'1470-10-03' ,'1483-04-09' ,'1087-09-09' ,'1327-01-20' ,'1035-11-12' ,'1141-04-07' ,'1154-10-25' ,'1216-10-19' ,'975-07-08' ,'939-10-27' ,'1413-03-20' ,'1702-03-08' ,'1183-06-11' ,'1100-08-02' ,'1199-04-06' ,'955-11-23' ,'1422-08-31' ,'946-05-26' ,'1553-07-06' ,'1694-12-28' ,'1558-11-17' ,'1707-04-30' ,'959-10-01' ,'1688-12-11' ,'978-03-18' ,'1040-03-17' ,'1042-06-08' ,'1485-08-22' ,'1217-09-22' ,'1066-10-14' ,'1016-11-30' ,'1141-11-01' ,'1483-06-26' ,'1066-12-17' ,'1014-02-03' ,'1553-07-19' ,'1625-03-27' ,'1214-12-04' ,'943-01-01' ,'1371-02-22' ,'1286-03-19' ,'1093-11-13' ,'1249-07-06' ,'1437-02-21' ,'1034-11-25' ,'1542-12-14' ,'1153-05-24' ,'1488-06-11' ,'1651-09-03' ,'1685-02-06' ,'1513-09-09' ,'1567-07-24' ,'1649-01-30' ,'995-01-01' ,'1460-08-03' ,'1329-06-07' ,'1390-04-19' ,'1124-04-23' ,'1057-08-15' ,'1406-04-04' ,'877-01-01' ,'858-02-13' ,'1702-03-08' ,'1165-12-09' ,'889-01-01' ,'900-01-01' ,'954-01-01' ,'1107-01-08' ,'1005-03-25' ,'962-01-01' ,'1040-08-14' ,'1694-12-28' ,'977-01-01' ,'1707-04-30' ,'NA' ,'971-01-01' ,'862-04-13' ,'1688-12-11' ,'1290-09-26' ,'1296-07-10' ,'1097-01-01' ,'1097-01-01' ,'878-01-01' ,'1058-03-17' ,'1094-11-12' ,'1193-01-01' ,'1318-10-14' ,'1260-01-01' ,'1137-01-01' ,'1240-04-11' ,'1170-01-01' ,'1195-01-01' ,'1170-01-01' ,'1282-12-11' ,'1416-01-01' ,'1255-01-01' ,'1378-07-01' ,'1246-02-25' ,'1283-10-03')
# Rebuild an ISO "YYYY-MM-DD" string from the parts of a split date,
# zero-padding the year to four digits (e.g. c("66", "01", "26") -> "0066-01-26")
# so lubridate::ymd() parses early (pre-1000) years correctly.
#
# @param x Character vector of date parts c(year, month, day), as produced by
#   str_split(dates, "-"). A year of NA or the literal string "NA" marks a
#   missing date.
# @return A "YYYY-MM-DD" character scalar, or NA when the year is missing.
padMDate <- function(x) {
  # `||` short-circuits, so the `==` comparison never runs on an actual NA.
  if (is.na(x[1]) || x[1] == "NA") {
    NA
  } else {
    # sprintf("%04d", ...) zero-pads the year using base R only, replacing the
    # stringr::str_pad() call; behaviour is identical for numeric year strings.
    paste0(c(sprintf("%04d", as.integer(x[1])), x[2], x[3]), collapse = "-")
  }
}
monarchs <- tibble::tibble(name=mNames, dominion=mDominion,
from=ymd(sapply(str_split(mFrom, "-"), FUN=padMDate)),
to=ymd(sapply(str_split(mTo, "-"), FUN=padMDate))
)
# Print monarchs
monarchs
## # A tibble: 131 x 4
## name dominion from to
## <chr> <chr> <date> <date>
## 1 Elizabeth II United Kingdom 1952-02-06 2018-02-08
## 2 Victoria United Kingdom 1837-06-20 1901-01-22
## 3 George V United Kingdom 1910-05-06 1936-01-20
## 4 George III United Kingdom 1801-01-01 1820-01-29
## 5 George VI United Kingdom 1936-12-11 1952-02-06
## 6 George IV United Kingdom 1820-01-29 1830-06-26
## 7 Edward VII United Kingdom 1901-01-22 1910-05-06
## 8 William IV United Kingdom 1830-06-26 1837-06-20
## 9 Edward VIII United Kingdom 1936-01-20 1936-12-11
## 10 George III(also United Kingdom) Great Britain 1760-10-25 1801-01-01
## # ... with 121 more rows
# Create an interval for reign
monarchs <- monarchs %>%
mutate(reign = from %--% to)
# Find the length of reign, and arrange
monarchs %>%
mutate(length = int_length(reign)) %>%
arrange(desc(length)) %>%
select(name, length, dominion)
## # A tibble: 131 x 3
## name length dominion
## <chr> <dbl> <chr>
## 1 Elizabeth II 2083017600 United Kingdom
## 2 Victoria 2006726400 United Kingdom
## 3 James VI 1820102400 Scotland
## 4 Gruffudd ap Cynan 1767139200 Gwynedd
## 5 Edward III 1590624000 England
## 6 William I 1545868800 Scotland
## 7 Llywelyn the Great 1428796800 Gwynedd
## 8 Elizabeth I 1399507200 England
## 9 Constantine II 1356912000 Scotland
## 10 David II 1316304000 Scotland
## # ... with 121 more rows
# Print halleys
pDate <- c('66-01-26', '141-03-25', '218-04-06', '295-04-07', '374-02-13', '451-07-03', '530-11-15', '607-03-26', '684-11-26', '760-06-10', '837-02-25', '912-07-27', '989-09-02', '1066-03-25', '1145-04-19', '1222-09-10', '1301-10-22', '1378-11-09', '1456-01-08', '1531-08-26', '1607-10-27', '1682-09-15', '1758-03-13', '1835-11-16', '1910-04-20', '1986-02-09', '2061-07-28')
sDate <- c('66-01-25', '141-03-22', '218-04-06', '295-04-07', '374-02-13', '451-06-28', '530-09-27', '607-03-15', '684-10-02', '760-05-20', '837-02-25', '912-07-18', '989-09-02', '1066-01-01', '1145-04-15', '1222-09-10', '1301-10-22', '1378-11-09', '1456-01-08', '1531-08-26', '1607-10-27', '1682-09-15', '1758-03-13', '1835-08-01', '1910-04-20', '1986-02-09', '2061-07-28')
eDate <- c('66-01-26', '141-03-25', '218-05-17', '295-04-20', '374-02-16', '451-07-03', '530-11-15', '607-03-26', '684-11-26', '760-06-10', '837-02-28', '912-07-27', '989-09-05', '1066-03-25', '1145-04-19', '1222-09-28', '1301-10-31', '1378-11-14', '1456-06-09', '1531-08-26', '1607-10-27', '1682-09-15', '1758-12-25', '1835-11-16', '1910-05-20', '1986-02-09', '2061-07-28')
halleys <- tibble::tibble(perihelion_date=ymd(sapply(str_split(pDate, "-"), FUN=padMDate)),
start_date=ymd(sapply(str_split(sDate, "-"), FUN=padMDate)),
end_date=ymd(sapply(str_split(eDate, "-"), FUN=padMDate))
)
# New column for interval from start to end date
halleys <- halleys %>%
mutate(visible = start_date %--% end_date)
# The visitation of 1066
halleys_1066 <- halleys[14, ]
# Monarchs in power on perihelion date
monarchs %>%
filter(halleys_1066$perihelion_date %within% reign) %>%
select(name, from, to, dominion)
## # A tibble: 2 x 4
## name from to dominion
## <chr> <date> <date> <chr>
## 1 Harold II 1066-01-05 1066-10-14 England
## 2 Malcolm III 1058-03-17 1093-11-13 Scotland
# Monarchs whose reign overlaps visible time
monarchs %>%
filter(int_overlaps(halleys_1066$visible, reign)) %>%
select(name, from, to, dominion)
## # A tibble: 3 x 4
## name from to dominion
## <chr> <date> <date> <chr>
## 1 Edward the Confessor 1042-06-08 1066-01-05 England
## 2 Harold II 1066-01-05 1066-10-14 England
## 3 Malcolm III 1058-03-17 1093-11-13 Scotland
# New columns for duration and period
monarchs <- monarchs %>%
mutate(
duration = as.duration(reign),
period = as.period(reign))
# Examine results
monarchs %>%
select(name, duration, period) %>%
head(10) %>%
print.data.frame()
## name duration
## 1 Elizabeth II 2083017600s (~66.01 years)
## 2 Victoria 2006726400s (~63.59 years)
## 3 George V 811296000s (~25.71 years)
## 4 George III 601948800s (~19.07 years)
## 5 George VI 478224000s (~15.15 years)
## 6 George IV 328406400s (~10.41 years)
## 7 Edward VII 292982400s (~9.28 years)
## 8 William IV 220406400s (~6.98 years)
## 9 Edward VIII 28166400s (~46.57 weeks)
## 10 George III(also United Kingdom) 1268092800s (~40.18 years)
## period
## 1 66y 0m 2d 0H 0M 0S
## 2 63y 7m 2d 0H 0M 0S
## 3 25y 8m 14d 0H 0M 0S
## 4 19y 0m 28d 0H 0M 0S
## 5 15y 1m 26d 0H 0M 0S
## 6 10y 4m 28d 0H 0M 0S
## 7 9y 3m 14d 0H 0M 0S
## 8 6y 11m 25d 0H 0M 0S
## 9 10m 21d 0H 0M 0S
## 10 40y 2m 7d 0H 0M 0S
Chapter 4 - Problems in Practice
Time zones - ways to keep track of times in different locations (can pose analysis challenges):
Importing and exporting datetimes:
Wrap-up:
Example code includes:
# Game2: CAN vs NZL in Edmonton
game2 <- mdy_hm("June 11 2015 19:00")
# Game3: CHN vs NZL in Winnipeg
game3 <- mdy_hm("June 15 2015 18:30")
# Set the timezone to "America/Edmonton"
game2_local <- force_tz(game2, tzone = "America/Edmonton")
game2_local
## [1] "2015-06-11 19:00:00 MDT"
# Set the timezone to "America/Winnipeg"
game3_local <- force_tz(game3, tzone = "America/Winnipeg")
game3_local
## [1] "2015-06-15 18:30:00 CDT"
# How long does the team have to rest?
as.period(game2_local %--% game3_local)
## [1] "3d 22H 30M 0S"
# What time is game2_local in NZ?
with_tz(game2_local, tzone = "Pacific/Auckland")
## [1] "2015-06-12 13:00:00 NZST"
# What time is game2_local in Corvallis, Oregon?
with_tz(game2_local, tzone = "America/Los_Angeles")
## [1] "2015-06-11 18:00:00 PDT"
# What time is game3_local in NZ?
with_tz(game3_local, tzone = "Pacific/Auckland")
## [1] "2015-06-16 11:30:00 NZST"
# Examine datetime and date_utc columns
head(akl_hourly$datetime)
## [1] "2016-01-01 00:00:00 UTC" "2016-01-01 00:30:00 UTC"
## [3] "2016-01-01 01:00:00 UTC" "2016-01-01 01:30:00 UTC"
## [5] "2016-01-01 02:00:00 UTC" "2016-01-01 02:30:00 UTC"
head(akl_hourly$date_utc)
## [1] "2015-12-31 11:00:00 UTC" "2015-12-31 11:30:00 UTC"
## [3] "2015-12-31 12:00:00 UTC" "2015-12-31 12:30:00 UTC"
## [5] "2015-12-31 13:00:00 UTC" "2015-12-31 13:30:00 UTC"
# Force datetime to Pacific/Auckland
akl_hourly <- akl_hourly %>%
mutate(
datetime = force_tz(datetime, tzone = "Pacific/Auckland"))
# Reexamine datetime
head(akl_hourly$datetime)
## [1] "2016-01-01 00:00:00 NZDT" "2016-01-01 00:30:00 NZDT"
## [3] "2016-01-01 01:00:00 NZDT" "2016-01-01 01:30:00 NZDT"
## [5] "2016-01-01 02:00:00 NZDT" "2016-01-01 02:30:00 NZDT"
# Are datetime and date_utc the same moments
table(akl_hourly$datetime - akl_hourly$date_utc)
##
## -82800 0 3600
## 2 17450 2
# Import auckland hourly data
akl_hourly <- read_csv("./RInputFiles/akl_weather_hourly_2016.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## month = col_integer(),
## mday = col_integer(),
## time = col_time(format = ""),
## temperature = col_double(),
## weather = col_character(),
## conditions = col_character(),
## events = col_character(),
## humidity = col_integer(),
## date_utc = col_datetime(format = "")
## )
# Examine structure of time column
str(akl_hourly$time)
## Classes 'hms', 'difftime' atomic [1:17454] 0 1800 3600 5400 7200 9000 10800 12600 14400 16200 ...
## ..- attr(*, "units")= chr "secs"
# Examine head of time column
head(akl_hourly$time)
## 00:00:00
## 00:30:00
## 01:00:00
## 01:30:00
## 02:00:00
## 02:30:00
# A plot using just time
ggplot(akl_hourly, aes(x = time, y = temperature)) +
geom_line(aes(group = make_date(year, month, mday)), alpha = 0.2)
library(microbenchmark)
library(fasttime)
# Examine structure of dates
dates <- paste0(gsub(" ", "T", as.character(akl_hourly$date_utc)), "Z")
str(dates)
## chr [1:17454] "2015-12-31T11:00:00Z" "2015-12-31T11:30:00Z" ...
# Use fastPOSIXct() to parse dates
fastPOSIXct(dates) %>% str()
## POSIXct[1:17454], format: "2015-12-31 05:00:00" "2015-12-31 05:30:00" ...
# Compare speed of fastPOSIXct() to ymd_hms()
microbenchmark(
ymd_hms = ymd_hms(dates),
fasttime = fastPOSIXct(dates),
times = 20)
## Unit: milliseconds
## expr min lq mean median uq max
## ymd_hms 23.401424 36.957481 39.019617 38.743348 43.595864 48.422524
## fasttime 1.678496 3.167903 3.260493 3.287514 3.557131 5.230494
## neval cld
## 20 b
## 20 a
# Head of dates
head(dates)
## [1] "2015-12-31T11:00:00Z" "2015-12-31T11:30:00Z" "2015-12-31T12:00:00Z"
## [4] "2015-12-31T12:30:00Z" "2015-12-31T13:00:00Z" "2015-12-31T13:30:00Z"
# Parse dates with fast_strptime
fast_strptime(dates,
format = "%Y-%m-%dT%H:%M:%SZ") %>% str()
## POSIXlt[1:17454], format: "2015-12-31 11:00:00" "2015-12-31 11:30:00" ...
# Compare speed to ymd_hms() and fasttime
microbenchmark(
ymd_hms = ymd_hms(dates),
fasttime = fastPOSIXct(dates),
fast_strptime = fast_strptime(dates,
format = "%Y-%m-%dT%H:%M:%SZ"),
times = 20)
## Unit: milliseconds
## expr min lq mean median uq
## ymd_hms 18.784773 24.114152 37.054787 34.139330 39.530093
## fasttime 1.570728 1.655403 2.821585 1.931730 3.369030
## fast_strptime 1.274662 1.340783 1.796211 1.820212 2.231942
## max neval cld
## 141.605829 20 b
## 9.164614 20 a
## 2.393396 20 a
finished <- "I finished 'Dates and Times in R' on Thursday, September 20, 2017!"
# Create a stamp based on "Sep 20 2017"
date_stamp <- stamp("September 20, 2017", orders="mdy")
## Multiple formats matched: "%Om %d, %Y"(1), "%B %d, %Y"(1)
## Using: "%B %d, %Y"
# Print date_stamp
date_stamp
## function (x, locale = "English_United States.1252")
## {
## {
## old_lc_time <- Sys.getlocale("LC_TIME")
## if (old_lc_time != locale) {
## Sys.setlocale("LC_TIME", locale)
## on.exit(Sys.setlocale("LC_TIME", old_lc_time))
## }
## }
## format(x, format = "%B %d, %Y")
## }
## <environment: 0x000000001184dd18>
# Call date_stamp on today()
date_stamp(today())
## [1] "March 15, 2018"
# Create and call a stamp based on "09/20/2017"
stamp("09/20/2017", orders="mdy")(today())
## Multiple formats matched: "%Om/%d/%Y"(1), "%m/%d/%Y"(1)
## Using: "%Om/%d/%Y"
## [1] "03/15/2018"
# Use string finished for stamp()
stamp(finished, orders="amdy")(today())
## Multiple formats matched: "I finished 'Dates and Times in R' on %A, %B %d, %Y!"(1), "I finished 'Dates and Times in R' on %A, %Om %d, %Y!"(0)
## Using: "I finished 'Dates and Times in R' on %A, %B %d, %Y!"
## [1] "I finished 'Dates and Times in R' on Thursday, March 15, 2018!"
Chapter 1 - Working with Increasingly Large Data Sets
What is scalable data processing?:
Working with “out of core” objects using the Bigmemory Project:
References vs. Copies:
Example code includes:
# Load the microbenchmark package
library(microbenchmark)
# Compare the timings for sorting different sizes of vector
mb <- microbenchmark(
# Sort a random normal vector length 1e5
"1e5" = sort(rnorm(1e5)),
# Sort a random normal vector length 2.5e5
"2.5e5" = sort(rnorm(2.5e5)),
# Sort a random normal vector length 5e5
"5e5" = sort(rnorm(5e5)),
"7.5e5" = sort(rnorm(7.5e5)),
"1e6" = sort(rnorm(1e6)),
times = 10
)
# Plot the resulting benchmark object
plot(mb)
# Load the bigmemory package
library(bigmemory)
# Create the big.matrix object: x
x <- read.big.matrix("./RInputFiles/mortgage-sample.csv", header = TRUE,
type = "integer",
backingfile = "mortgage-sample.bin",
descriptorfile = "mortgage-sample.desc")
# Find the dimensions of x
dim(x)
## [1] 70000 16
# Attach mortgage-sample.desc
mort <- attach.big.matrix("mortgage-sample.desc")
# Find the dimensions of mort
dim(mort)
## [1] 70000 16
# Look at the first 6 rows of mort
head(mort)
## enterprise record_number msa perc_minority tract_income_ratio
## [1,] 1 566 1 1 3
## [2,] 1 116 1 3 2
## [3,] 1 239 1 2 2
## [4,] 1 62 1 2 3
## [5,] 1 106 1 2 3
## [6,] 1 759 1 3 3
## borrower_income_ratio loan_purpose federal_guarantee borrower_race
## [1,] 1 2 4 3
## [2,] 1 2 4 5
## [3,] 3 8 4 5
## [4,] 3 2 4 5
## [5,] 3 2 4 9
## [6,] 2 2 4 9
## co_borrower_race borrower_gender co_borrower_gender num_units
## [1,] 9 2 4 1
## [2,] 9 1 4 1
## [3,] 5 1 2 1
## [4,] 9 2 4 1
## [5,] 9 3 4 1
## [6,] 9 1 2 2
## affordability year type
## [1,] 3 2010 1
## [2,] 3 2008 1
## [3,] 4 2014 0
## [4,] 4 2009 1
## [5,] 4 2013 1
## [6,] 4 2010 1
# Create mort
mort <- attach.big.matrix("mortgage-sample.desc")
# Look at the first 3 rows
mort[1:3, ]
## enterprise record_number msa perc_minority tract_income_ratio
## [1,] 1 566 1 1 3
## [2,] 1 116 1 3 2
## [3,] 1 239 1 2 2
## borrower_income_ratio loan_purpose federal_guarantee borrower_race
## [1,] 1 2 4 3
## [2,] 1 2 4 5
## [3,] 3 8 4 5
## co_borrower_race borrower_gender co_borrower_gender num_units
## [1,] 9 2 4 1
## [2,] 9 1 4 1
## [3,] 5 1 2 1
## affordability year type
## [1,] 3 2010 1
## [2,] 3 2008 1
## [3,] 4 2014 0
# Create a table of the number of mortgages for each year in the data set
table(mort[, "year"])
##
## 2008 2009 2010 2011 2012 2013 2014 2015
## 8468 11101 8836 7996 10935 10216 5714 6734
# Count the DLLs currently loaded by this R session. R enforces a
# hard-coded maximum on the number of loaded DLLs, which is why this
# document had to be split into _v003 and v_003_a (see the introduction).
a <- getLoadedDLLs()
length(a)
## [1] 39
# Attempt to unload DLLs belonging to packages that are no longer
# attached. Here it frees nothing ("named list()"), so the DLL count
# below is unchanged at 39.
R.utils::gcDLLs()
## named list()
a <- getLoadedDLLs()
length(a)
## [1] 39
# Load the biganalytics package (error in loading to Knit file, works OK otherwise)
library(biganalytics)
## Loading required package: foreach
## Loading required package: biglm
## Loading required package: DBI
# Get the column means of mort
colmean(mort)
## enterprise record_number msa
## 1.3814571 499.9080571 0.8943571
## perc_minority tract_income_ratio borrower_income_ratio
## 1.9701857 2.3431571 2.6898857
## loan_purpose federal_guarantee borrower_race
## 3.7670143 3.9840857 5.3572429
## co_borrower_race borrower_gender co_borrower_gender
## 7.0002714 1.4590714 3.0494857
## num_units affordability year
## 1.0398143 4.2863429 2011.2714714
## type
## 0.5300429
# Use biganalytics' summary function to get a summary of the data
summary(mort)
## min max mean NAs
## enterprise 1.0000000 2.0000000 1.3814571 0.0000000
## record_number 0.0000000 999.0000000 499.9080571 0.0000000
## msa 0.0000000 1.0000000 0.8943571 0.0000000
## perc_minority 1.0000000 9.0000000 1.9701857 0.0000000
## tract_income_ratio 1.0000000 9.0000000 2.3431571 0.0000000
## borrower_income_ratio 1.0000000 9.0000000 2.6898857 0.0000000
## loan_purpose 1.0000000 9.0000000 3.7670143 0.0000000
## federal_guarantee 1.0000000 4.0000000 3.9840857 0.0000000
## borrower_race 1.0000000 9.0000000 5.3572429 0.0000000
## co_borrower_race 1.0000000 9.0000000 7.0002714 0.0000000
## borrower_gender 1.0000000 9.0000000 1.4590714 0.0000000
## co_borrower_gender 1.0000000 9.0000000 3.0494857 0.0000000
## num_units 1.0000000 4.0000000 1.0398143 0.0000000
## affordability 0.0000000 9.0000000 4.2863429 0.0000000
## year 2008.0000000 2015.0000000 2011.2714714 0.0000000
## type 0.0000000 1.0000000 0.5300429 0.0000000
# Use deepcopy() to create first_three
# deepcopy() writes an independent on-disk copy of columns 1-3, so later
# changes to first_three cannot show up in mort (verified below).
first_three <- deepcopy(mort, cols = 1:3,
backingfile = "first_three.bin",
descriptorfile = "first_three.desc")
# Set first_three_2 equal to first_three
# NOTE: big.matrix objects behave as references -- this assignment copies
# only the handle, not the data, so first_three and first_three_2 share
# the same underlying matrix (demonstrated by the NA propagating below).
first_three_2 <- first_three
# Set the value in the first row and first column of first_three to NA
first_three[1, 1] <- NA
# Verify the change shows up in first_three_2
first_three_2[1, 1]
## [1] NA
# but not in mort
mort[1, 1]
## [1] 1
Chapter 2 - Processing and Analyzing Data with bigmemory
The Bigmemory Suite of Packages:
Split-Apply-Combine (aka Split-Compute-Combine), run in this course using split() Map() Reduce():
Visualize results using tidyverse:
Limitations of bigmemory - process is useful for dense, numeric matrices that can be stored on hard disk:
Example code includes:
library(bigtabulate)
library(tidyr)
library(ggplot2)
library(biganalytics)
library(dplyr)
race_cat <- c('Native Am', 'Asian', 'Black', 'Pacific Is', 'White', 'Two or More', 'Hispanic', 'Not Avail')
# Call bigtable to create a variable called race_table
race_table <- bigtable(mort, "borrower_race")
# Rename the elements of race_table
names(race_table) <- race_cat
race_table
## Native Am Asian Black Pacific Is White Two or More
## 143 4438 2020 195 50006 528
## Hispanic Not Avail
## 4040 8630
# Create a table of the borrower race by year
race_year_table <- bigtable(mort, c("borrower_race", "year"))
# Convert rydf to a data frame
rydf <- as.data.frame(race_year_table)
# Create the new column Race
rydf$Race <- race_cat
# Let's see what it looks like
rydf
## 2008 2009 2010 2011 2012 2013 2014 2015 Race
## 1 11 18 13 16 15 12 29 29 Native Am
## 2 384 583 603 568 770 673 369 488 Asian
## 3 363 320 209 204 258 312 185 169 Black
## 4 33 38 21 13 28 22 17 23 Pacific Is
## 5 5552 7739 6301 5746 8192 7535 4110 4831 White
## 6 43 85 65 58 89 78 46 64 Two or More
## 7 577 563 384 378 574 613 439 512 Hispanic
## 9 1505 1755 1240 1013 1009 971 519 618 Not Avail
# Proportion of female borrowers (borrower_gender == 2), split by
# residence: first element urban (msa == 1), second rural (msa == 0).
#
# x    - a matrix-like object (e.g. a big.matrix) with columns
#        "borrower_gender" and "msa"
# rows - a row selector (integer positions or logical mask) for the
#        subset of x to analyze
#
# Returns a length-2 numeric vector: c(prop_female_urban, prop_female_rural).
female_residence_prop <- function(x, rows) {
  x_subset <- x[rows, ]
  # Shared helper: proportion of female borrowers among the rows whose
  # msa equals the given code (1 = urban, 0 = rural). Factoring this out
  # removes the duplicated urban/rural computation of the original.
  prop_female <- function(msa_code) {
    in_region <- x_subset[, "msa"] == msa_code
    sum(x_subset[, "borrower_gender"] == 2 & in_region) / sum(in_region)
  }
  c(prop_female(1), prop_female(0))
}
# Find the proportion of female borrowers in 2015
female_residence_prop(mort, mort[, "year"] == 2015)
## [1] 0.2737439 0.2304965
# Split the row numbers of the mortgage data by year
spl <- split(1:nrow(mort), mort[, "year"])
# Call str on spl
str(spl)
## List of 8
## $ 2008: int [1:8468] 2 8 15 17 18 28 35 40 42 47 ...
## $ 2009: int [1:11101] 4 13 25 31 43 49 52 56 67 68 ...
## $ 2010: int [1:8836] 1 6 7 10 21 23 24 27 29 38 ...
## $ 2011: int [1:7996] 11 20 37 46 53 57 73 83 86 87 ...
## $ 2012: int [1:10935] 14 16 26 30 32 33 48 69 81 94 ...
## $ 2013: int [1:10216] 5 9 19 22 36 44 55 58 72 74 ...
## $ 2014: int [1:5714] 3 12 50 60 64 66 103 114 122 130 ...
## $ 2015: int [1:6734] 34 41 54 61 62 65 82 91 102 135 ...
# For each of the row splits, find the female residence proportion
all_years <- Map(function(rows) female_residence_prop(mort, rows), spl)
# Call str on all_years
str(all_years)
## List of 8
## $ 2008: num [1:2] 0.275 0.204
## $ 2009: num [1:2] 0.244 0.2
## $ 2010: num [1:2] 0.241 0.201
## $ 2011: num [1:2] 0.252 0.241
## $ 2012: num [1:2] 0.244 0.21
## $ 2013: num [1:2] 0.275 0.257
## $ 2014: num [1:2] 0.289 0.268
## $ 2015: num [1:2] 0.274 0.23
# Collect the results as rows in a matrix
prop_female <- Reduce(rbind, all_years)
# Rename the row and column names
dimnames(prop_female) <- list(names(all_years), c("prop_female_urban", "prop_femal_rural"))
# View the matrix
prop_female
## prop_female_urban prop_femal_rural
## 2008 0.2748514 0.2039474
## 2009 0.2441074 0.2002978
## 2010 0.2413881 0.2014028
## 2011 0.2520644 0.2408931
## 2012 0.2438950 0.2101313
## 2013 0.2751059 0.2567164
## 2014 0.2886756 0.2678571
## 2015 0.2737439 0.2304965
# Convert prop_female to a data frame
prop_female_df <- as.data.frame(prop_female)
# Add a new column Year
prop_female_df$Year <- row.names(prop_female_df)
# Call gather on prop_female_df
prop_female_long <- gather(prop_female_df, Region, Prop, -Year)
# Create a line plot
ggplot(prop_female_long, aes(x = Year, y = Prop, group = Region, color = Region)) +
geom_line()
# Call summary on mort
summary(mort)
## min max mean NAs
## enterprise 1.0000000 2.0000000 1.3814571 0.0000000
## record_number 0.0000000 999.0000000 499.9080571 0.0000000
## msa 0.0000000 1.0000000 0.8943571 0.0000000
## perc_minority 1.0000000 9.0000000 1.9701857 0.0000000
## tract_income_ratio 1.0000000 9.0000000 2.3431571 0.0000000
## borrower_income_ratio 1.0000000 9.0000000 2.6898857 0.0000000
## loan_purpose 1.0000000 9.0000000 3.7670143 0.0000000
## federal_guarantee 1.0000000 4.0000000 3.9840857 0.0000000
## borrower_race 1.0000000 9.0000000 5.3572429 0.0000000
## co_borrower_race 1.0000000 9.0000000 7.0002714 0.0000000
## borrower_gender 1.0000000 9.0000000 1.4590714 0.0000000
## co_borrower_gender 1.0000000 9.0000000 3.0494857 0.0000000
## num_units 1.0000000 4.0000000 1.0398143 0.0000000
## affordability 0.0000000 9.0000000 4.2863429 0.0000000
## year 2008.0000000 2015.0000000 2011.2714714 0.0000000
## type 0.0000000 1.0000000 0.5300429 0.0000000
bir_df_wide <- bigtable(mort, c("borrower_income_ratio", "year")) %>%
as.data.frame() %>%
tibble::rownames_to_column() %>%
filter(rowname %in% c(1, 2, 3)) %>%
select(-rowname) %>%
# Create a new column called BIR with the corresponding table categories
mutate(BIR = c(">=0,<=50%", ">50, <=80%", ">80%"))
bir_df_wide
## 2008 2009 2010 2011 2012 2013 2014 2015 BIR
## 1 1205 1473 600 620 745 725 401 380 >=0,<=50%
## 2 2095 2791 1554 1421 1819 1861 1032 1145 >50, <=80%
## 3 4844 6707 6609 5934 8338 7559 4255 5169 >80%
bir_df_wide %>%
# Transform the wide-formatted data.frame into the long format
gather(Year, Count, -BIR) %>%
# Use ggplot to create a line plot
ggplot(aes(x = Year, y = Count, group = BIR, color = BIR)) +
geom_line()
Chapter 3 - Working with iotools
Introduction to chunk-wise processing - solution to challenges from bigmemory:
First look at iotools: Importing data:
Using chunk.apply - effectively moves away from what is functionally a “for loop” to allow better parallel processing:
Example code includes:
# Compute the overall range c(min, max) of x.
# Works "foldably": x may be a plain vector, or a list of ranges
# produced by earlier calls, which are combined element by element so
# partial results can be merged into a single range.
foldable_range <- function(x) {
  if (!is.list(x)) {
    # Plain vector: take its range directly
    return(range(x))
  }
  # List input: fold the per-element results down to one overall range
  lower <- Reduce(min, x)
  upper <- Reduce(max, x)
  c(lower, upper)
}
# Verify that foldable_range() works on the record_number column
foldable_range(mort[, "record_number"])
## [1] 0 999
# Split the mortgage data by year
spl <- split(1:nrow(mort), mort[, "year"])
# Use foldable_range() to get the range of the record numbers
foldable_range(Map(function(s) foldable_range(mort[s, "record_number"]), spl))
## [1] 0 999
# Load the iotools and microbenchmark packages
library(iotools)
library(microbenchmark)
# Time the reading of files
microbenchmark(
# Time the reading of a file using read.delim five times
read.delim("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ","),
# Time the reading of a file using read.delim.raw five times
read.delim.raw("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ","),
times = 5
)
## Unit: milliseconds
## expr
## read.delim("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ",")
## read.delim.raw("./RInputFiles/mortgage-sample.csv", header = FALSE, sep = ",")
## min lq mean median uq max neval cld
## 342.44097 443.7392 456.9239 486.7489 503.6207 508.0696 5 b
## 91.95404 103.7521 133.6853 114.5944 120.4411 237.6851 5 a
# Read mortgage-sample.csv as a raw vector
raw_file_content <- readAsRaw("./RInputFiles/mortgage-sample.csv")
# Convert the raw vector contents to a matrix
mort_mat <- mstrsplit(raw_file_content, sep = ",", type = "integer", skip = 1)
# Look at the first 6 rows
head(mort_mat)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 1 566 1 1 3 1 2 4 3 9 2 4 1
## [2,] 1 116 1 3 2 1 2 4 5 9 1 4 1
## [3,] 1 239 1 2 2 3 8 4 5 5 1 2 1
## [4,] 1 62 1 2 3 3 2 4 5 9 2 4 1
## [5,] 1 106 1 2 3 3 2 4 9 9 3 4 1
## [6,] 1 759 1 3 3 2 2 4 9 9 1 2 2
## [,14] [,15] [,16]
## [1,] 3 2010 1
## [2,] 3 2008 1
## [3,] 4 2014 0
## [4,] 4 2009 1
## [5,] 4 2013 1
## [6,] 4 2010 1
# Convert the raw file contents to a data.frame
mort_df <- dstrsplit(raw_file_content, sep = ",", col_types = rep("integer", 16), skip = 1)
# Look at the first 6 rows
head(mort_df)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1 1 566 1 1 3 1 2 4 3 9 2 4 1 3 2010 1
## 2 1 116 1 3 2 1 2 4 5 9 1 4 1 3 2008 1
## 3 1 239 1 2 2 3 8 4 5 5 1 2 1 4 2014 0
## 4 1 62 1 2 3 3 2 4 5 9 2 4 1 4 2009 1
## 5 1 106 1 2 3 3 2 4 9 9 3 4 1 4 2013 1
## 6 1 759 1 3 3 2 2 4 9 9 1 2 2 4 2010 1
# We have created a file connection fc to the "mortgage-sample.csv" file and read in the first line to get rid of the header.
# Define the function to apply to each chunk
# Per-chunk worker for chunk.apply(): parse a raw chunk of the CSV into
# an integer matrix and tabulate its third column (msa, per the header
# read from the file connection).
make_table <- function(chunk) {
  parsed <- mstrsplit(chunk, type = "integer", sep = ",")
  table(parsed[, 3])
}
# Create a file connection to mortgage-sample.csv
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
# Read the first line to get rid of the header
(col_names <- readLines(fc, n = 1))
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
(col_names <- lapply(str_split(col_names, '\\",\\"'), FUN=function(x) { str_replace(x, '\\"', '') })[[1]])
## [1] "enterprise" "record_number"
## [3] "msa" "perc_minority"
## [5] "tract_income_ratio" "borrower_income_ratio"
## [7] "loan_purpose" "federal_guarantee"
## [9] "borrower_race" "co_borrower_race"
## [11] "borrower_gender" "co_borrower_gender"
## [13] "num_units" "affordability"
## [15] "year" "type"
# Read the data in chunks
counts <- chunk.apply(fc, make_table, CH.MAX.SIZE = 1e5)
# Close the file connection
close(fc)
# Print counts
counts
## 0 1
## [1,] 309 2401
## [2,] 289 2422
## [3,] 266 2444
## [4,] 300 2410
## [5,] 279 2431
## [6,] 310 2400
## [7,] 274 2436
## [8,] 283 2428
## [9,] 259 2452
## [10,] 287 2423
## [11,] 288 2423
## [12,] 283 2428
## [13,] 271 2439
## [14,] 299 2411
## [15,] 294 2416
## [16,] 305 2405
## [17,] 280 2431
## [18,] 275 2435
## [19,] 303 2407
## [20,] 279 2431
## [21,] 296 2414
## [22,] 294 2417
## [23,] 288 2424
## [24,] 264 2446
## [25,] 292 2418
## [26,] 228 2013
# Sum up the chunks
colSums(counts)
## 0 1
## 7395 62605
msa_map <- c("rural", "urban")
# Define the function to apply to each chunk
# Per-chunk worker for chunk.apply(): parse a raw CSV chunk into a data
# frame, label its columns with the previously captured header names,
# translate the msa code (0/1) into a readable label via msa_map, and
# tabulate those labels.
# Relies on col_names and msa_map defined earlier in the script.
make_msa_table <- function(chunk) {
  df <- dstrsplit(chunk, col_types = rep("integer", length(col_names)), sep = ",")
  colnames(df) <- col_names
  # msa is 0 (rural) or 1 (urban); +1 converts to a 1-based index
  df$msa_pretty <- msa_map[df$msa + 1]
  table(df$msa_pretty)
}
# Create a file connection to mortgage-sample.csv
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
# Read the first line to get rid of the header
readLines(fc, n = 1)
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
# Read the data in chunks
counts <- chunk.apply(fc, make_msa_table, CH.MAX.SIZE = 1e5)
# Close the file connection
close(fc)
# Aggregate the counts as before
colSums(counts)
## rural urban
## 7395 62605
# Benchmark helper: chunk-wise read of mortgage-sample.csv, tabulating
# msa labels per chunk with make_msa_table().
#
# parallel - number of parallel processes passed to chunk.apply()
#
# Returns the matrix of per-chunk counts. (The original returned the
# NULL result of close() and leaked the connection if chunk.apply()
# errored; on.exit() now guarantees the connection is closed.)
iotools_read_fun <- function(parallel) {
  fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
  # Close the connection even if chunk.apply() throws an error
  on.exit(close(fc), add = TRUE)
  # Discard the header line so chunks start at the first data row
  readLines(fc, n = 1)
  chunk.apply(fc, make_msa_table,
              CH.MAX.SIZE = 1e5, parallel = parallel)
}
# Benchmark the new function
microbenchmark(
# Use one process
iotools_read_fun(1),
# Use three processes
iotools_read_fun(3),
times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max
## iotools_read_fun(1) 114.2869 120.8175 126.4070 125.6258 129.6085 143.5224
## iotools_read_fun(3) 111.1809 118.8315 124.6302 124.5232 129.2364 142.1210
## neval cld
## 20 a
## 20 a
Chapter 4 - Case Study: Preliminary Analysis of Housing Data
Overview of types of analysis for this chapter:
Are the data missing at random?
Analyzing the Housing Data:
Borrower Lending Trends: City vs. Rural:
Wrap up:
Example code includes:
# Create a table of borrower_race column
race_table <- bigtable(mort, "borrower_race")
# Rename the elements
names(race_table) <- race_cat[as.numeric(names(race_table))]
# Find the proportion
race_table[1:7] / sum(race_table[1:7])
## Native Am Asian Black Pacific Is White Two or More
## 0.002330129 0.072315464 0.032915105 0.003177448 0.814828092 0.008603552
## Hispanic
## 0.065830210
mort_names <- col_names
# Create table of the borrower_race
race_table_chunks <- chunk.apply(
"./RInputFiles/mortgage-sample.csv", function(chunk) {
x <- mstrsplit(chunk, sep = ",", type = "integer")
colnames(x) <- mort_names
table(x[, "borrower_race"])
}, CH.MAX.SIZE = 1e5)
# Add up the columns
race_table <- colSums(race_table_chunks)
# Find the proportion
borrower_proportion <- race_table[1:7] / sum(race_table[1:7])
pop_proportion <- c(0.009, 0.048, 0.126, 0.002, 0.724, 0.029, 0.163)
names(pop_proportion) <- race_cat[1:7]
# Create the matrix
matrix(c(pop_proportion, borrower_proportion), byrow = TRUE, nrow = 2,
dimnames = list(c("Population Proportion", "Borrower Proportion"), race_cat[1:7]))
## Native Am Asian Black Pacific Is
## Population Proportion 0.009000000 0.04800000 0.12600000 0.002000000
## Borrower Proportion 0.002330129 0.07231546 0.03291511 0.003177448
## White Two or More Hispanic
## Population Proportion 0.7240000 0.029000000 0.16300000
## Borrower Proportion 0.8148281 0.008603552 0.06583021
# Create a variable indicating if borrower_race is missing in the mortgage data
borrower_race_ind <- mort[, "borrower_race"] == 9
# Create a factor variable indicating the affordability
affordability_factor <- factor(mort[, "affordability"])
# Perform a logistic regression
summary(glm(borrower_race_ind ~ affordability_factor, family = binomial))
## Warning: closing unused connection 5 (./RInputFiles/mortgage-sample.csv)
##
## Call:
## glm(formula = borrower_race_ind ~ affordability_factor, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5969 -0.5016 -0.5016 -0.5016 2.0867
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7478 0.1376 -12.701 <2e-16 ***
## affordability_factor1 -0.2241 0.1536 -1.459 0.1447
## affordability_factor2 -0.3090 0.1609 -1.920 0.0548 .
## affordability_factor3 -0.2094 0.1446 -1.448 0.1476
## affordability_factor4 -0.2619 0.1383 -1.894 0.0582 .
## affordability_factor9 0.1131 0.1413 0.800 0.4235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 52279 on 69999 degrees of freedom
## Residual deviance: 52166 on 69994 degrees of freedom
## AIC: 52178
##
## Number of Fisher Scoring iterations: 4
# Open a connection to the file and skip the header
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
readLines(fc, n = 1)
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
# Create a function to read chunks
# Per-chunk worker for chunk.apply(): parse a raw CSV chunk as an
# integer matrix, attach the saved column names (mort_names), and
# cross-tabulate borrower_race by year with bigtable().
make_table <- function(chunk) {
  mat <- mstrsplit(chunk, sep = ",", type = "integer")
  colnames(mat) <- mort_names
  bigtable(mat, c("borrower_race", "year"))
}
# Import data using chunk.apply
race_year_table <- chunk.apply(fc, make_table)
# Close connection
close(fc)
# Cast it to a data frame
rydf <- as.data.frame(race_year_table)
# Create a new column Race with race/ethnicity
rydf$Race <- race_cat
# Note: We removed the row corresponding to "Not Avail".
# View rydf
rydf <-
rydf %>%
filter(Race !="Not Avail")
rydf
## 2008 2009 2010 2011 2012 2013 2014 2015 Race
## 1 11 18 13 16 15 12 29 29 Native Am
## 2 384 583 603 568 770 673 369 488 Asian
## 3 363 320 209 204 258 312 185 169 Black
## 4 33 38 21 13 28 22 17 23 Pacific Is
## 5 5552 7739 6301 5746 8192 7535 4110 4831 White
## 6 43 85 65 58 89 78 46 64 Two or More
## 7 577 563 384 378 574 613 439 512 Hispanic
# View pop_proportion
pop_proportion
## Native Am Asian Black Pacific Is White Two or More
## 0.009 0.048 0.126 0.002 0.724 0.029
## Hispanic
## 0.163
# Gather on all variables except Race
rydfl <- gather(rydf, Year, Count, -Race)
# Create a new adjusted count variable
rydfl$Adjusted_Count <- rydfl$Count / pop_proportion[rydfl$Race]
# Plot
ggplot(rydfl, aes(x = Year, y = Adjusted_Count, group = Race, color = Race)) +
geom_line()
# View rydf
rydf
## 2008 2009 2010 2011 2012 2013 2014 2015 Race
## 1 11 18 13 16 15 12 29 29 Native Am
## 2 384 583 603 568 770 673 369 488 Asian
## 3 363 320 209 204 258 312 185 169 Black
## 4 33 38 21 13 28 22 17 23 Pacific Is
## 5 5552 7739 6301 5746 8192 7535 4110 4831 White
## 6 43 85 65 58 89 78 46 64 Two or More
## 7 577 563 384 378 574 613 439 512 Hispanic
# Normalize the columns
for (i in seq_len(nrow(rydf))) {
rydf[i, 1:8] <- rydf[i, 1:8] / rydf[i, 1]
}
# Convert the data to long format
rydf_long <- gather(rydf, Year, Proportion, -Race)
# Plot
ggplot(rydf_long, aes(x = Year, y = Proportion, group = Race, color = Race)) +
geom_line()
# Open a connection to the file and skip the header
fc <- file("./RInputFiles/mortgage-sample.csv", "rb")
readLines(fc, n = 1)
## [1] "\"enterprise\",\"record_number\",\"msa\",\"perc_minority\",\"tract_income_ratio\",\"borrower_income_ratio\",\"loan_purpose\",\"federal_guarantee\",\"borrower_race\",\"co_borrower_race\",\"borrower_gender\",\"co_borrower_gender\",\"num_units\",\"affordability\",\"year\",\"type\""
# Create a function to read chunks
# Per-chunk worker for chunk.apply(): parse a raw CSV chunk as an
# integer matrix, attach the saved column names (mort_names), and
# cross-tabulate msa by year with bigtable().
make_table <- function(chunk) {
  m2 <- mstrsplit(chunk, sep = ",", type = "integer")
  colnames(m2) <- mort_names
  bigtable(m2, c("msa", "year"))
}
# Import data using chunk.apply
msa_year_table <- chunk.apply(fc, make_table)
# Close connection
close(fc)
# Convert to a data frame
df_msa <- as.data.frame(msa_year_table)
# Rename columns
df_msa$MSA <- c("rural", "city")
# Gather on all columns except Year
df_msa_long <- gather(df_msa, Year, Count, -MSA)
# Plot
ggplot(df_msa_long, aes(x = Year, y = Count, group = MSA, color = MSA)) +
geom_line()
# Tabulate borrower_income_ratio and federal_guarantee
ir_by_fg <- bigtable(mort, c("borrower_income_ratio", "federal_guarantee"))
# Label the columns and rows of the table
income_cat <- c('0 <= 50', '50 < 80', '> 80', 'Not Applicable')
guarantee_cat <- c('FHA/VA', 'RHS', 'HECM', 'No Guarantee')
dimnames(ir_by_fg) <- list(income_cat, guarantee_cat)
# For each row in ir_by_fg, divide by the sum of the row
for (i in seq_len(nrow(ir_by_fg))) {
ir_by_fg[i, ] = ir_by_fg[i, ] / sum(ir_by_fg[i, ])
}
# Print
ir_by_fg
## FHA/VA RHS HECM No Guarantee
## 0 <= 50 0.008944544 0.0014636526 0.0443974630 0.9451943
## 50 < 80 0.005977548 0.0024055985 0.0026971862 0.9889197
## > 80 0.001113022 0.0002428412 0.0006475766 0.9979966
## Not Applicable 0.023676880 0.0013927577 0.0487465181 0.9261838
# Quirky fix so that the files can be used again later
rm(mort)
rm(x)
rm(first_three)
rm(first_three_2)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1883739 100.7 3205452 171.2 3205452 171.2
## Vcells 12877108 98.3 20701811 158.0 17923360 136.8
Chapter 1 - Downloading Files and Using API Clients
Introduction: Working with Web Data in R:
Understanding Application Programming Interfaces (API) - automatically handling data changes:
Access tokens and API:
Example code includes:
# Here are the URLs! As you can see they're just normal strings
csv_url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1561/datasets/chickwts.csv"
tsv_url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_3026/datasets/tsv_data.tsv"
# Read a file in from the CSV URL and assign it to csv_data
csv_data <- read.csv(csv_url)
# Read a file in from the TSV URL and assign it to tsv_data
tsv_data <- read.delim(tsv_url)
# Examine the objects with head()
head(csv_data)
## weight feed
## 1 179 horsebean
## 2 160 horsebean
## 3 136 horsebean
## 4 227 horsebean
## 5 217 horsebean
## 6 168 horsebean
head(tsv_data)
## weight feed
## 1 179 horsebean
## 2 160 horsebean
## 3 136 horsebean
## 4 227 horsebean
## 5 217 horsebean
## 6 168 horsebean
# Download the file with download.file()
download.file(url = csv_url, destfile = "./RInputFiles/feed_data.csv")
# Read it in with read.csv()
csv_data <- read.csv("./RInputFiles/feed_data.csv")
# Add a new column: square_weight
csv_data$square_weight <- csv_data$weight ** 2
# Save it to disk with saveRDS()
saveRDS(csv_data, "./RInputFiles/modified_feed_data.RDS")
# Read it back in with readRDS()
modified_feed_data <- readRDS("./RInputFiles/modified_feed_data.RDS")
# Examine modified_feed_data
str(modified_feed_data)
## 'data.frame': 71 obs. of 3 variables:
## $ weight : int 179 160 136 227 217 168 108 124 143 140 ...
## $ feed : Factor w/ 6 levels "casein","horsebean",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ square_weight: num 32041 25600 18496 51529 47089 ...
# Load pageviews
# library(pageviews)
# Get the pageviews for "Hadley Wickham"
hadley_pageviews <- pageviews::article_pageviews(project = "en.wikipedia", "Hadley Wickham")
# Examine the resulting object
str(hadley_pageviews)
## 'data.frame': 1 obs. of 8 variables:
## $ project : chr "wikipedia"
## $ language : chr "en"
## $ article : chr "Hadley_Wickham"
## $ access : chr "all-access"
## $ agent : chr "all-agents"
## $ granularity: chr "daily"
## $ date : POSIXct, format: "2015-10-01"
## $ views : num 53
# Load birdnik
# library(birdnik)
# Get the word frequency for "vector", using api_key to access it
# vector_frequency <- word_frequency(api_key, "vector")
Chapter 2 - Using httr to interact with API Directly
GET and POST requests in theory - https and web requests in theory:
Graceful httr - code that responds appropriately and constructs its own url:
Respectful API Usage - usage that works for the API owners as well as the clients:
Example code includes:
# Load the httr package
library(httr)
# Make a GET request to http://httpbin.org/get
get_result <- GET("http://httpbin.org/get")
# Print it to inspect it
# get_result
# Make a POST request to http://httpbin.org/post with the body "this is a test"
# post_result <- POST(url="http://httpbin.org/post", body="this is a test")
# Print it to inspect it
# post_result
url <- "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/all-agents/Hadley_Wickham/daily/20170101/20170102"
# Make a GET request to url and save the results
pageview_response <- GET(url)
# Call content() to retrieve the data the server sent back
pageview_data <- content(pageview_response)
# Examine the results with str()
str(pageview_data)
## List of 1
## $ items:List of 2
## ..$ :List of 7
## .. ..$ project : chr "en.wikipedia"
## .. ..$ article : chr "Hadley_Wickham"
## .. ..$ granularity: chr "daily"
## .. ..$ timestamp : chr "2017010100"
## .. ..$ access : chr "all-access"
## .. ..$ agent : chr "all-agents"
## .. ..$ views : int 45
## ..$ :List of 7
## .. ..$ project : chr "en.wikipedia"
## .. ..$ article : chr "Hadley_Wickham"
## .. ..$ granularity: chr "daily"
## .. ..$ timestamp : chr "2017010200"
## .. ..$ access : chr "all-access"
## .. ..$ agent : chr "all-agents"
## .. ..$ views : int 86
fake_url <- "http://google.com/fakepagethatdoesnotexist"
# Make the GET request
request_result <- GET(fake_url)
# Check request_result
if(http_error(request_result)){
warning("The request failed")
} else {
content(request_result)
}
## Warning: The request failed
# Construct a directory-based API URL to `http://swapi.co/api`,
# looking for person `1` in `people`
directory_url <- paste("http://swapi.co/api", "people", 1, sep = "/")
# Make a GET call with it
result <- GET(directory_url)
# Create list with nationality and country elements
query_params <- list(nationality = "americans",
country = "antigua")
# Make parameter-based call to httpbin, with query_params
parameter_response <- GET("https://httpbin.org/get", query = query_params)
# Print parameter_response
parameter_response
## Response [https://httpbin.org/get?nationality=americans&country=antigua]
## Date: 2018-02-16 13:02
## Status: 200
## Content-Type: application/json
## Size: 425 B
## {
## "args": {
## "country": "antigua",
## "nationality": "americans"
## },
## "headers": {
## "Accept": "application/json, text/xml, application/xml, */*",
## "Accept-Encoding": "gzip, deflate",
## "Connection": "close",
## "Host": "httpbin.org",
## ...
# Do not change the url
# url <- "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/Aaron_Halfaker/daily/2015100100/2015103100"
# Add the email address and the test sentence inside user_agent()
# server_response <- GET(url, user_agent("my@email.address this is a test"))
# Construct a vector of 2 (fake, illustrative) URLs to request in sequence
urls <- c("http://fakeurl.com/api/1.0/", "http://fakeurl.com/api/2.0/")
for(url in urls){
# Send a GET request to url
result <- GET(url)
# Delay for 1 second between requests so we don't overload the server
# (respectful API usage; the original comment said 5 seconds, but the
# code sleeps for 1 -- the comment is corrected here to match the code)
Sys.sleep(1)
}
# Fetch daily pageview counts (Oct 2015) for a Wikipedia article from the
# Wikimedia REST API.
#
# Args:
#   article_title: title of the article, as used in the API URL path.
# Returns: the parsed response body from content(), a nested list.
# Stops with an error if the HTTP request fails.
get_pageviews <- function(article_title){
# BUG FIX: the original used paste0(..., sep = "/"), but paste0() has no
# `sep` argument -- the "/" was silently treated as one more string to
# concatenate, so the URL had no separators between the path components
# ("...all-agentsTitledaily/...") plus a stray trailing "/". paste() with
# sep = "/" joins the components correctly.
url <- paste("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents",
             article_title,
             "daily/2015100100/2015103100",
             sep = "/")
# Identify ourselves to the API owners (respectful API usage)
response <- GET(url, user_agent("my@email.com this is a test"))
if(http_error(response)){
stop("the request failed")
} else {
result <- content(response)
return(result)
}
}
Chapter 3 - Handling JSON and XML
JSON is a dictionary-like format (plain text) for sending data on the internet:
Manipulating JSON - lists are the natural R hierarchy for JSON:
XML Structure - plain text like JSON, but with a very different structure:
XPATH - language for specifying nodes in an XML document:
Example code includes:
# Return a cached revision-history response for a Wikipedia article.
#
# Only the pre-saved responses for "Hadley Wickham" are available, in either
# "json" or "xml" format; any other title or format is an error.
#
# Args:
#   title:  article title; must be exactly "Hadley Wickham".
#   format: "json" (default) or "xml".
# Returns: the deserialized response object from the matching .rds file.
rev_history <- function(title, format = "json"){
# Guard: only the cached article is supported offline
if (title != "Hadley Wickham") {
stop('rev_history() only works for `title = "Hadley Wickham"`')
}
# Dispatch on the requested format; unknown formats hit the default branch
switch(format,
json = readRDS("had_rev_json.rds"),
xml  = readRDS("had_rev_xml.rds"),
stop('Invalid format supplied, try "json" or "xml"')
)
}
test_json <- "{\"continue\":{\"rvcontinue\":\"20150528042700|664370232\",\"continue\":\"||\"},\"query\":{\"pages\":{\"41916270\":{\"pageid\":41916270,\"ns\":0,\"title\":\"Hadley Wickham\",\"revisions\":[{\"user\":\"214.28.226.251\",\"anon\":\"\",\"timestamp\":\"2015-01-14T17:12:45Z\",\"comment\":\"\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Mary Helen Wickham III''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"73.183.151.193\",\"anon\":\"\",\"timestamp\":\"2015-01-15T15:49:34Z\",\"comment\":\"\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"FeanorStar7\",\"timestamp\":\"2015-01-24T16:34:31Z\",\"comment\":\"/* External links */ add LCCN and cats\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"KasparBot\",\"timestamp\":\"2015-04-26T19:18:17Z\",\"comment\":\"authority control moved to wikidata\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! 
|publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"},{\"user\":\"Spkal\",\"timestamp\":\"2015-05-06T18:24:57Z\",\"comment\":\"/* Bibliography */ Added his new book, R Packages\",\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"'''Hadley Wickham''' is a [[statistician]] from [[New Zealand]] who is currently Chief Scientist at [[RStudio]]<ref>{{cite web|url=http://washstat.org/wss1310.shtml |title=Washington Statistical Society October 2013 Newsletter |publisher=Washstat.org |date= |accessdate=2014-02-12}}</ref><ref>{{cite web|url=http://news.idg.no/cw/art.cfm?id=F66B12BB-D13E-94B0-DAA22F5AB01BEFE7 |title=60+ R resources to improve your data skills ( - Software ) |publisher=News.idg.no |date= |accessdate=2014-02-12}}</ref> and an [[Professors_in_the_United_States#Adjunct_professor|adjunct]] [[Assistant Professor]] of statistics at [[Rice University]].<ref name=\\\"about\\\">{{cite web|url=http://www.rstudio.com/about/ |title=About - RStudio |accessdate=2014-08-13}}</ref> He is best known for his development of open-source statistical analysis software packages for [[R (programming language)]] that implement logics of [[data visualisation]] and data transformation. 
Wickham completed his undergraduate studies at the [[University of Auckland]] and his PhD at [[Iowa State University]] under the supervision of Di Cook and Heike Hoffman.<ref>{{cite web|URL=http://blog.revolutionanalytics.com/2010/09/the-r-files-hadley-wickham.html |title= The R-Files: Hadley Wickham}}</ref> In 2006 he was awarded the [[John_Chambers_(statistician)|John Chambers]] Award for Statistical Computing for his work developing tools for data reshaping and visualisation.<ref>{{cite web|url=http://stat-computing.org/awards/jmc/winners.html |title=John Chambers Award Past winners|publisher=ASA Sections on Statistical Computing, Statistical Graphics,|date= |accessdate=2014-08-12}}</ref>\\n\\nHe is a prominent and active member of the [[R (programming language)|R]] user community and has developed several notable and widely used packages including [[ggplot2]], plyr, dplyr, and reshape2.<ref name=\\\"about\\\" /><ref>{{cite web|url=http://www.r-statistics.com/2013/06/top-100-r-packages-for-2013-jan-may/ |title=Top 100 R Packages for 2013 (Jan-May)! |publisher=R-statistics blog |date= |accessdate=2014-08-12}}</ref>\"}]}}}}"
# Get revision history for "Hadley Wickham"
resp_json <- rev_history("Hadley Wickham")
# Check http_type() of resp_json
http_type(resp_json)
# Examine returned text with content()
content(resp_json, as="text")
# Parse response with content()
content(resp_json, as="parsed")
# Parse returned text with fromJSON()
library(jsonlite)
fromJSON(content(resp_json, as="text"))
# Load rlist
library(rlist)
# Examine output of this code
str(content(resp_json), max.level = 4)
# Store revision list
revs <- content(resp_json)$query$pages$`41916270`$revisions
# Extract the user element
user_time <- list.select(revs, user, timestamp)
# Print user_time
user_time
# Stack to turn into a data frame
list.stack(user_time)
# Load dplyr
library(dplyr)
# Pull out revision list
revs <- content(resp_json)$query$pages$`41916270`$revisions
# Extract user and timestamp
revs %>%
bind_rows() %>%
select(user, timestamp)
# Load xml2
library(xml2)
# Get XML revision history
resp_xml <- rev_history("Hadley Wickham", format = "xml")
# Check response is XML
http_type(resp_xml)
# Examine returned text with content()
rev_text <- content(resp_xml, as="text")
rev_text
# Turn rev_text into an XML document
rev_xml <- read_xml(rev_text)
# Examine the structure of rev_xml
str(rev_xml)
# Load xml2
library(xml2)
# Get XML revision history
resp_xml <- rev_history("Hadley Wickham", format = "xml")
# Check response is XML
http_type(resp_xml)
# Examine returned text with content()
rev_text <- content(resp_xml, as="text")
rev_text
# Turn rev_text into an XML document
rev_xml <- read_xml(rev_text)
# Examine the structure of rev_xml
xml_structure(rev_xml)
# Find all nodes using XPATH "/api/query/pages/page/revisions/rev"
xml_find_all(rev_xml, "/api/query/pages/page/revisions/rev")
# Find all rev nodes anywhere in document
rev_nodes <- xml_find_all(rev_xml, "//rev")
# Use xml_text() to get text from rev_nodes
xml_text(rev_nodes)
# All rev nodes
rev_nodes <- xml_find_all(rev_xml, "//rev")
# The first rev node
first_rev_node <- xml_find_first(rev_xml, "//rev")
# Find all attributes with xml_attrs()
xml_attrs(first_rev_node)
# Find user attribute with xml_attr()
xml_attr(first_rev_node, attr="user")
# Find user attribute for all rev nodes
xml_attr(rev_nodes, attr="user")
# Find anon attribute for all rev nodes
xml_attr(rev_nodes, attr="anon")
# Build a data frame of revision metadata for a Wikipedia article.
#
# Fetches the cached XML revision history via rev_history(), extracts every
# <rev> node, and returns one row per revision.
#
# Args:
#   article_title: article title passed through to rev_history().
# Returns: data.frame with columns user (chr), timestamp (POSIXct), and
#   content (chr, truncated to the first 40 characters).
get_revision_history <- function(article_title){
# Retrieve the raw XML-format revision response
response <- rev_history(article_title, format = "xml")
# Parse the response body text into an XML document
history_xml <- read_xml(content(response, "text"))
# Locate every <rev> node anywhere in the document
revision_nodes <- xml_find_all(history_xml, "//rev")
# Assemble the result: attributes for user/timestamp, node text for content
data.frame(user      = xml_attr(revision_nodes, attr = "user"),
           timestamp = readr::parse_datetime(xml_attr(revision_nodes, "timestamp")),
           content   = substr(xml_text(revision_nodes), 1, 40))
}
# Call function for "Hadley Wickham"
get_revision_history(article_title = "Hadley Wickham")
Chapter 4 - Web Scraping with XPATH
Web scraping 101 - sometimes a website does not have an API, so a different approach is required:
HTML structure - basically, content within tags, much like XML:
This is a test
requests that “This is a test” be available in paragraph form. Reformatting data (especially to a rectangular format such as a data frame):
Example code includes:
# Load rvest
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Hadley Wickham's Wikipedia page
test_url <- "https://en.wikipedia.org/wiki/Hadley_Wickham"
# Read the URL stored as "test_url" with read_html()
test_xml <- read_html(test_url)
# Print test_xml
test_xml
## {xml_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-sub ...
test_node_xpath <- "//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"vcard\", \" \" ))]"
# Use html_node() to grab the node with the XPATH stored as `test_node_xpath`
node <- html_node(x = test_xml, xpath = test_node_xpath)
# Print the first element of the result
node[1]
## $node
## <pointer: 0x000000000b91bb80>
# The first thing we'll grab is a name, from the first element of the previously extracted table (now stored as table_element)
table_element <- node
# Extract the name of table_element
element_name <- html_name(table_element)
# Print the name
element_name
## [1] "table"
second_xpath_val <- "//*[contains(concat( \" \", @class, \" \" ), concat( \" \", \"fn\", \" \" ))]"
# Extract the element of table_element referred to by second_xpath_val and store it as page_name
page_name <- html_node(x = table_element, xpath = second_xpath_val)
# Extract the text from page_name
page_title <- html_text(page_name)
# Print page_title
page_title
## [1] "Hadley Wickham"
# Turn table_element into a data frame and assign it to wiki_table
wiki_table <- html_table(table_element)
# Print wiki_table
wiki_table
## Hadley Wickham
## 1
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## 11
## Hadley Wickham
## 1
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
## 11
# Rename the columns of wiki_table
colnames(wiki_table) <- c("key", "value")
# Remove the empty row from wiki_table
cleaned_table <- subset(wiki_table, !(key == ""))
# Print cleaned_table
cleaned_table
## key
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## value
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
Chapter 5 - CSS Web Scraping and Final Case Study
CSS (cascading style sheets) web scraping in theory:
Final case study: Introduction:
Wrap up:
Example code includes:
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Hadley Wickham's Wikipedia page
test_url <- "https://en.wikipedia.org/wiki/Hadley_Wickham"
# Read the URL stored as "test_url" with read_html()
test_xml <- read_html(test_url)
# Print test_xml
test_xml
## {xml_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-sub ...
# Select the table elements
html_nodes(test_xml, css = "table")
## {xml_nodeset (2)}
## [1] <table class="infobox biography vcard" style="width:22em">\n<tr>\n<t ...
## [2] <table class="nowraplinks hlist navbox-inner" style="border-spacing: ...
# Select elements with class = "infobox"
html_nodes(test_xml, css = ".infobox")
## {xml_nodeset (1)}
## [1] <table class="infobox biography vcard" style="width:22em">\n<tr>\n<t ...
# Select elements with id = "firstHeading"
html_nodes(test_xml, css = "#firstHeading")
## {xml_nodeset (1)}
## [1] <h1 id="firstHeading" class="firstHeading" lang="en">Hadley Wickham< ...
# Extract element with class infobox
infobox_element <- html_nodes(test_xml, css = ".infobox")
# Get tag name of infobox_element
element_name <- html_name(infobox_element)
# Print element_name
element_name
## [1] "table"
# Extract element with class fn
page_name <- html_node(x = infobox_element, css=".fn")
# Get contents of page_name
page_title <- html_text(page_name)
# Print page_title
page_title
## [1] "Hadley Wickham"
# Load httr
library(httr)
# The API url
base_url <- "https://en.wikipedia.org/w/api.php"
# Set query parameters
query_params <- list(action="parse",
page="Hadley Wickham",
format="xml")
# Get data from API
resp <- GET(url = "https://en.wikipedia.org/w/api.php", query = query_params)
# Parse response
resp_xml <- content(resp)
# Load rvest
library(rvest)
# Read page contents as HTML
page_html <- read_html(xml_text(resp_xml))
# Extract infobox element
infobox_element <- html_node(page_html, css=".infobox")
# Extract page name element from infobox
page_name <- html_node(infobox_element, css=".fn")
# Extract page name as text
page_title <- html_text(page_name)
# Your code from earlier exercises
wiki_table <- html_table(infobox_element)
colnames(wiki_table) <- c("key", "value")
cleaned_table <- subset(wiki_table, !key == "")
# Create a dataframe for full name
name_df <- data.frame(key = "Full name", value = page_title)
# Combine name_df with cleaned_table
wiki_table2 <- rbind(name_df, cleaned_table)
# Print wiki_table
wiki_table2
## key
## 1 Full name
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## value
## 1 Hadley Wickham
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
library(httr)
library(rvest)
library(xml2)
# Scrape the infobox of a Wikipedia page into a key/value data frame.
#
# Queries the MediaWiki parse API for the page, reads the returned HTML,
# extracts the ".infobox" table and the page's display name (".fn"), and
# prepends the name as a "Full name" row.
#
# Args:
#   title: Wikipedia page title (e.g. "Hadley Wickham").
# Returns: data.frame with columns key and value; empty-key rows removed.
get_infobox <- function(title){
api_url <- "https://en.wikipedia.org/w/api.php"
# Ask the API to parse `title` and return the result as XML
params <- list(action = "parse",
               page = title,
               format = "xml")
api_resp <- GET(url = api_url, query = params)
parsed <- content(api_resp)
# The page body is HTML embedded inside the XML response
page_html <- read_html(xml_text(parsed))
# Pull out the infobox table and the page's display name
infobox_element <- html_node(x = page_html, css = ".infobox")
full_name <- html_text(html_node(x = infobox_element, css = ".fn"))
# Convert the infobox into a two-column data frame and drop empty keys
info_df <- html_table(infobox_element)
colnames(info_df) <- c("key", "value")
info_df <- subset(info_df, !info_df$key == "")
# Prepend the full name as its own row and return
rbind(data.frame(key = "Full name", value = full_name), info_df)
}
# Test get_infobox with "Hadley Wickham"
get_infobox(title = "Hadley Wickham")
## key
## 1 Full name
## 2 Residence
## 3 Alma mater
## 4 Known for
## 5 Awards
## 6 Scientific career
## 7 Fields
## 8 Thesis
## 9 Doctoral advisors
## 10 Doctoral students
## value
## 1 Hadley Wickham
## 2 United States
## 3 Iowa State University, University of Auckland
## 4 R programming language packages
## 5 John Chambers Award (2006)\nFellow of the American Statistical Association (2015)
## 6 Scientific career
## 7 Statistics\nData science\nR (programming language)
## 8 Practical tools for exploring data and models (2008)
## 9 Di Cook\nHeike Hofmann
## 10 Garrett Grolemund
# Try get_infobox with "Ross Ihaka"
get_infobox(title = "Ross Ihaka")
## key
## 1 Full name
## 2 Ihaka at the 2010 New Zealand Open Source Awards
## 3 Residence
## 4 Alma mater
## 5 Known for
## 6 Awards
## 7 Scientific career
## 8 Fields
## 9 Institutions
## 10 Thesis
## 11 Doctoral advisor
## value
## 1 Ross Ihaka
## 2 Ihaka at the 2010 New Zealand Open Source Awards
## 3 Auckland, New Zealand
## 4 University of AucklandUniversity of California, Berkeley
## 5 R programming language
## 6 Pickering Medal (2008)
## 7 Scientific career
## 8 Statistical Computing
## 9 University of Auckland
## 10 Ruaumoko (1985)
## 11 David R. Brillinger
# Try get_infobox with "Grace Hopper"
get_infobox(title = "Grace Hopper")
## key
## 1 Full name
## 2 Rear Admiral Grace M. Hopper, 1984
## 3 Nickname(s)
## 4 Born
## 5 Died
## 6 Place of burial
## 7 Allegiance
## 8 Service/branch
## 9 Years of service
## 10 Rank
## 11 Awards
## value
## 1 Grace Murray Hopper
## 2 Rear Admiral Grace M. Hopper, 1984
## 3 "Amazing Grace"
## 4 (1906-12-09)December 9, 1906\nNew York City, New York, U.S.
## 5 January 1, 1992(1992-01-01) (aged 85)Arlington, Virginia, U.S.
## 6 Arlington National Cemetery
## 7 United States of America
## 8 United States Navy
## 9 1943–1966, 1967–1971, 1972–1986
## 10 Rear admiral (lower half)
## 11 Defense Distinguished Service Medal Legion of Merit Meritorious Service Medal American Campaign Medal World War II Victory Medal National Defense Service Medal Armed Forces Reserve Medal with two Hourglass Devices Naval Reserve Medal Presidential Medal of Freedom (posthumous)
Chapter 1 - Basic plotting with lattice
Introduction - general objectives:
Optional arguments:
Box and whisker plots and reordering elements:
Example code includes:
data(airquality)
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Load the lattice package
library(lattice)
# Create the histogram
histogram(~ Ozone, data = airquality)
# Create the histogram
histogram(~ Ozone, data = airquality,
# Specify number of bins
nint = 15,
# Specify quantity displayed on y-axis
type = "count")
# Create the scatter plot
xyplot(Ozone ~ Solar.R, data = airquality)
# Create scatterplot
xyplot(Ozone ~ Temp, data = airquality,
# Add main label
main = "Environmental conditions in New York City (1973)",
# Add axis labels
ylab = "Ozone (ppb)",
xlab = "Temperature (Fahrenheit)")
# Create a density plot
densityplot(~ Ozone, data = airquality,
# Choose how raw data is shown
plot.points = "jitter")
data(USCancerRates, package="latticeExtra")
str(USCancerRates)
## 'data.frame': 3041 obs. of 8 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female: num 124 103 161 157 151 ...
## $ UCL95.female: num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
rn_USCR <- row.names(USCancerRates)
# Create reordered variable
library(dplyr)
USCancerRates <-
mutate(USCancerRates,
state.ordered = reorder(state, rate.female, median, na.rm = TRUE)
)
# Create box and whisker plot
bwplot(state.ordered ~ rate.female, data = USCancerRates)
# Create box and whisker plot
bwplot(state.ordered ~ rate.female, data = USCancerRates,
# Change whiskers extent
coef = 0)
Chapter 2 - Conditioning and the Formula Interface
Conditioning - identify sources of variability in the data by examining sub-groups:
Data summary and transformation - grouping:
Incorporating external data sources:
The trellis object - lattice creates trellis objects rather than directly creating plots (as in base R):
Example code includes:
# The airquality dataset has been pre-loaded
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Create a histogram
histogram(~ Ozone | factor(Month),
data = airquality,
# Define the layout
layout=c(2, 3),
# Change the x-axis label
xlab="Ozone (ppb)")
# USCancerRates has been pre-loaded
str(USCancerRates)
## 'data.frame': 3041 obs. of 9 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female : num 124 103 161 157 151 ...
## $ UCL95.female : num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
## $ state.ordered: Factor w/ 49 levels "Utah","New Mexico",..: 25 25 25 25 25 25 25 25 25 25 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
# Create a density plot
densityplot(~ rate.male + rate.female,
data = USCancerRates,
outer = TRUE,
# Suppress data points
plot.points = FALSE,
# Add a reference line
ref=TRUE)
# Create a density plot
densityplot(~ rate.male + rate.female,
data = USCancerRates,
# Set value of 'outer'
outer=FALSE,
# Add x-axis label
xlab="Rate (per 100,000)",
# Add a legend
auto.key=TRUE,
plot.points = FALSE,
ref = TRUE)
xyplot(Ozone ~ Temp, airquality, groups = Month,
# Complete the legend spec
auto.key = list(space = "right",
title = "Month",
text = month.name[5:9]))
USCancerRates <- USCancerRates %>%
mutate(division=state.division[match(state, state.name)])
# Create 'division.ordered' by reordering levels
USCancerRates <-
mutate(USCancerRates,
division.ordered = reorder(division,
rate.male + rate.female,
mean, na.rm = TRUE))
# Create conditioned scatter plot
xyplot(rate.female ~ rate.male | division.ordered,
data = USCancerRates,
# Add reference grid
grid = TRUE,
# Add reference line
abline = c(0, 1))
# Levels of division.ordered
levels(USCancerRates$division.ordered)
## [1] "Mountain" "West North Central" "Pacific"
## [4] "Middle Atlantic" "New England" "East North Central"
## [7] "West South Central" "South Atlantic" "East South Central"
# Specify the as.table argument
xyplot(rate.female ~ rate.male | division.ordered,
data = USCancerRates,
grid = TRUE, abline = c(0, 1),
as.table=TRUE)
# Create box-and-whisker plot
bwplot(division.ordered ~ rate.male + rate.female,
data = USCancerRates,
outer = TRUE,
# Add a label for the x-axis
xlab="Rate (per 100,000)",
# Add strip labels
strip = strip.custom(factor.levels = c("Male", "Female")))
# Create "trellis" object
tplot <-
densityplot(~ rate.male + rate.female | division.ordered,
data = USCancerRates, outer = TRUE,
plot.points = FALSE, as.table = TRUE)
# Change names for the second dimension
dimnames(tplot)[[2]] <- c("Male", "Female")
# Update x-axis label and plot
update(tplot, xlab = "Rate")
# Create "trellis" object
tplot <-
densityplot(~ rate.male + rate.female | division.ordered,
data = USCancerRates, outer = TRUE,
plot.points = FALSE, as.table = TRUE)
# Inspect dimension
dim(tplot)
## [1] 9 2
dimnames(tplot)
## $division.ordered
## [1] "Mountain" "West North Central" "Pacific"
## [4] "Middle Atlantic" "New England" "East North Central"
## [7] "West South Central" "South Atlantic" "East South Central"
##
## [[2]]
## [1] "rate.male" "rate.female"
# Select subset retaining only last three divisions
tplot[7:9, ]
Chapter 3 - Controlling scales and graphical parameters
Combining scales:
Logarithmic scales:
Graphical parameters:
Using simpleTheme():
Example code includes:
# The lattice package and the USMortality dataset have been pre-loaded.
Status <- factor(c('Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural', 'Urban', 'Rural'), levels=c("Rural", "Urban")
)
Sex <- factor(c('Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female'), levels=c("Female", "Male")
)
Cause <- factor(c('Heart disease', 'Heart disease', 'Heart disease', 'Heart disease', 'Cancer', 'Cancer', 'Cancer', 'Cancer', 'Lower respiratory', 'Lower respiratory', 'Lower respiratory', 'Lower respiratory', 'Unintentional injuries', 'Unintentional injuries', 'Unintentional injuries', 'Unintentional injuries', 'Cerebrovascular diseases', 'Cerebrovascular diseases', 'Cerebrovascular diseases', 'Cerebrovascular diseases', 'Alzheimers', 'Alzheimers', 'Alzheimers', 'Alzheimers', 'Diabetes', 'Diabetes', 'Diabetes', 'Diabetes', 'Flu and pneumonia', 'Flu and pneumonia', 'Flu and pneumonia', 'Flu and pneumonia', 'Suicide', 'Suicide', 'Suicide', 'Suicide', 'Nephritis', 'Nephritis', 'Nephritis', 'Nephritis'),
levels=c('Alzheimers', 'Cancer', 'Cerebrovascular diseases', 'Diabetes', 'Flu and pneumonia', 'Heart disease', 'Lower respiratory', 'Nephritis', 'Suicide', 'Unintentional injuries')
)
Rate <- c(210.2, 242.7, 132.5, 154.9, 195.9, 219.3, 140.2, 150.8, 44.5, 62.8, 36.5, 46.9, 49.6, 71.3, 24.7, 37.2, 36.1, 42.2, 34.9, 42.2, 19.4, 21.8, 25.5, 30.6, 24.9, 29.5, 17.1, 21.8, 17.7, 20.8, 12.9, 16.3, 19.2, 26.3, 5.3, 6.2, 15.7, 18.3, 10.7, 13.9)
SE <- c(0.2, 0.6, 0.2, 0.4, 0.2, 0.5, 0.2, 0.4, 0.1, 0.3, 0.1, 0.2, 0.1, 0.3, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.1, 0.1, 0.2, 0, 0.1, 0.1, 0.2, 0, 0.1)
USMortality <- data.frame(Status=Status, Sex=Sex, Cause=Cause, Rate=Rate, SE=SE)
# Specify upper bound to exclude Heart disease and Cancer
x_limits <- c(0, 100)
# Draw the plot
dotplot(Cause ~ Rate | Sex + Status, data = USMortality, as.table = TRUE,
xlim = x_limits)
dotplot(Cause ~ Rate | Sex + Status, data = USMortality,
as.table = TRUE,
scales = list(x = list(relation = "free",
# Specify limits for each panel
limits = list(c(0, 50), c(0, 80),
c(0, 50), c(0, 80) ))))
dotplot(Cause ~ Rate | Sex + Status, data = USMortality,
as.table = TRUE,
# Change the number of tick marks
scales = list(x = list(tick.number = 10,
# Show `Rate` labels on both bottom and top
alternating = 3,
# Rotate `Rate` labels by 90 degrees
rot = 90),
# Rotate `Cause` labels by 45 degrees
y = list(rot = 45)))
# Tick positions for a log-2 x-axis: 2^3 = 8 up to 2^8 = 256.
# Note: use the documented exponentiation operator `^`; `**` is an
# undocumented, deprecated synonym in R and should be avoided.
x_ticks_at <- 2^(3:8)
dotplot(Cause ~ Rate | Sex, data = USMortality,
groups = Status, auto.key = list(columns = 2),
scales = list(x = list(log = 2,
# A numeric vector with
# values 2^3, 2^4, ..., 2^8
at = x_ticks_at,
# A character vector,
# "8" for 2^3, "16" for 2^4, etc.
labels = x_ticks_at)))
# Create the dot plot
dotplot(Cause ~ Rate | Status, data = USMortality,
groups = Sex, auto.key = list(columns = 2),
scales = list(x = list(log = TRUE,
equispaced.log = FALSE)),
# Provide pch values for the two groups
pch = c(3, 1))
dotplot(Cause ~ Rate | Status, data = USMortality,
groups = Sex, auto.key = list(columns = 2),
par.settings = simpleTheme(pch = c(3, 1)),
scales = list(x = list(log = 2, equispaced.log = FALSE)))
# The WorldPhones matrix is already provided, with the first row removed so you need only consider consecutive years
data(WorldPhones)
WorldPhones <- WorldPhones[row.names(WorldPhones) != 1951, ]
WorldPhones
## N.Amer Europe Asia S.Amer Oceania Africa Mid.Amer
## 1956 60423 29990 4708 2568 2366 1411 733
## 1957 64721 32510 5230 2695 2526 1546 773
## 1958 68484 35218 6662 2845 2691 1663 836
## 1959 71799 37598 6856 3000 2868 1769 911
## 1960 76036 40341 8220 3145 3054 1905 1008
## 1961 79831 43173 9053 3338 3224 2005 1076
names(dimnames(WorldPhones)) <- c("Year", "Region")
# Transform matrix data to data frame
WorldPhonesDF <- as.data.frame(
# Intermediate step: convert to table
as.table(WorldPhones),
responseName = "Phones")
# Create the dot plot
dotplot(Year ~ Phones | Region,
data = WorldPhonesDF,
as.table = TRUE,
# Log-transform the x-axis
scales = list(x = list(log = TRUE,
equispaced.log = FALSE,
# Set x-axis relation to "sliced"
relation = "sliced")))
# Load latticeExtra package for ggplot2like()
library(latticeExtra)
## Loading required package: RColorBrewer
##
## Attaching package: 'latticeExtra'
## The following object is masked _by_ '.GlobalEnv':
##
## USCancerRates
## The following object is masked from 'package:ggplot2':
##
## layer
# Transform matrix data to data frame
names(dimnames(WorldPhones)) <- c("Year", "Region")
WorldPhonesDF <-
as.data.frame(as.table(WorldPhones[-1, ]),
responseName = "Phones")
# Create the dot plot
dotplot(Year ~ Phones | Region,
data = WorldPhonesDF,
as.table = TRUE,
scales = list(x = list(log = TRUE,
equispaced.log = FALSE,
relation = "sliced")),
# Fill in suitable value of par.settings
par.settings = ggplot2like(),
# Fill in suitable value of lattice.options
lattice.options = ggplot2like.opts())
# Create factor variable
airquality$Month.Name <-
factor(airquality$Month, levels = 1:12,
labels = month.name[1:12])
# Create histogram of Ozone, conditioning on Month
histogram(~ Ozone | Month.Name,
data = airquality, as.table = TRUE,
# Set border to be transparent
border = "transparent",
# Set fill color to be mid-gray
col = "grey50")
# Create factor variable
airquality$Month.Name <-
factor(airquality$Month, levels = 1:12,
labels = month.name)
levels(airquality$Month.Name)
## [1] "January" "February" "March" "April" "May"
## [6] "June" "July" "August" "September" "October"
## [11] "November" "December"
# Drop empty levels
airquality$Month.Name <- droplevels(airquality$Month.Name)
levels(airquality$Month.Name)
## [1] "May" "June" "July" "August" "September"
# Obtain colors from RColorBrewer
library(RColorBrewer)
my.colors <- brewer.pal(n = 5, name = "Set1")
# Density plot of ozone concentration grouped by month
densityplot(~ Ozone, data = airquality, groups = Month.Name,
plot.points = FALSE,
auto.key = list(space = "right"),
# Fill in value of col
par.settings = simpleTheme(col = my.colors,
# Fill in value of lwd
lwd = 2))
Chapter 4 - Customizing plots using panel functions
Panel functions:
Prepanel Functions to control limits:
Optional arguments of default panel functions:
Example code includes:
# Panel function: a standard scatter plot augmented with axis rugs
# marking partially-observed points. Observations with a missing y
# cannot be plotted, so their x-values are shown as a rug along the
# x-axis (and symmetrically for missing x along the y-axis).
panel.xyrug <- function(x, y, ...)
{
  # Usual scatter plot of the complete observations
  panel.xyplot(x, y, ...)
  # Flag the incomplete observations in each coordinate
  missing.y <- is.na(y)
  missing.x <- is.na(x)
  # Rug ticks: x-positions where y is absent, y-positions where x is absent
  panel.rug(x = x[missing.y], y = y[missing.x])
}
airquality$Month.Name <-
factor(month.name[airquality$Month], levels = month.name)
xyplot(Ozone ~ Solar.R | Month.Name, data = airquality,
panel = panel.xyrug, as.table = TRUE)
# Create factor variable with month names
airquality$Month.Name <-
factor(month.name[airquality$Month], levels = month.name)
# Create box-and-whisker plot
bwplot(Month.Name ~ Ozone + Temp, airquality,
# Specify outer
outer=TRUE,
# Specify x-axis relation
scales = list(x = list(relation="free")),
# Specify layout
layout=c(2, 1),
# Specify x-axis label
xlab="Measured value")
# Create violin plot
bwplot(Month.Name ~ Ozone + Temp, airquality,
# Specify outer
outer = TRUE,
# Specify x-axis relation
scales = list(x = list(relation="free")),
# Specify layout
layout=c(2, 1),
# Specify x-axis label
xlab="Measured value",
# Replace default panel function
panel = panel.violin)
# Create panel function
panel.ss <- function(x, y, ...) {
# Call panel.smoothScatter()
panel.smoothScatter(x, y, ...)
# Call panel.loess()
panel.loess(x, y, col = "red")
# Call panel.abline()
panel.abline(0, 1)
}
# Create plot
xyplot(rate.female ~ rate.male, data = USCancerRates,
panel = panel.ss,
main = "County-wise deaths due to cancer")
## (loaded the KernSmooth namespace)
# Define prepanel function
prepanel.histdens.2 <- function(x, ...) {
h <- prepanel.default.histogram(x, ...)
d <- density(x, na.rm = TRUE)
list(xlim = quantile(x, c(0.005, 0.995), na.rm = TRUE),
# Calculate upper y-limit
ylim = c(0, max(d$y, h$ylim[2])))
}
# Panel function: draw a histogram, then overlay a kernel density
# curve (NAs removed). Intended to be paired with a prepanel function
# that makes the y-limits tall enough for the density peak.
panel.histdens <- function(x, ...) {
  panel.histogram(x, ...)
  panel.lines(density(x, na.rm = TRUE))
}
# Create a histogram of rate.male and rate.female
histogram(~ rate.male + rate.female,
data = USCancerRates, outer = TRUE,
type = "density", nint = 50,
border = "transparent", col = "lightblue",
# The panel function: panel.histdens
panel = panel.histdens,
# The prepanel function: prepanel.histdens.2
prepanel = prepanel.histdens.2,
# Ensure that the x-axis is log-transformed
# and has relation "sliced"
scales = list(x = list(log = TRUE,
equispaced.log = FALSE,
relation = "sliced")),
xlab = "Rate (per 100,000)")
# Create the box and whisker plot
bwplot(division.ordered ~ rate.male,
data = USCancerRates,
# Indicate median by line instead of dot
pch = "|",
# Include notches for confidence interval
notch = TRUE,
# The x-axis should plot log-transformed values
scales = list(x = list(log=TRUE, equispaced.log=FALSE)),
xlab = "Death Rate in Males (per 100,000)")
# Load the 'latticeExtra' package
library(latticeExtra)
# Create summary dataset
USCancerRates.state <-
with(USCancerRates, {
rmale <- tapply(rate.male, state, median, na.rm = TRUE)
rfemale <- tapply(rate.female, state, median, na.rm = TRUE)
data.frame(Rate = c(rmale, rfemale),
State = rep(names(rmale), 2),
Gender = rep(c("Male", "Female"),
each = length(rmale)))
})
# Reorder levels
library(dplyr)
USCancerRates.state <-
mutate(USCancerRates.state, State = reorder(State, Rate))
head(USCancerRates.state)
## Rate State Gender
## 1 286.00 Alabama Male
## 2 237.95 Alaska Male
## 3 209.30 Arizona Male
## 4 284.10 Arkansas Male
## 5 221.30 California Male
## 6 204.40 Colorado Male
# URLs for emojis
emoji.man <- "https://twemoji.maxcdn.com/72x72/1f468.png"
emoji.woman <- "https://twemoji.maxcdn.com/72x72/1f469.png"
# Create dotplot
# dotplot(State ~ Rate, data = USCancerRates.state,
# Specify grouping variable
# groups = Gender,
# Specify panel function
# panel = panel.xyimage,
# Specify emoji URLs
# pch = c(emoji.woman, emoji.man),
# Make symbols smaller
# cex = 0.75)
Chapter 5 - Extensions and the lattice ecosystem
New methods - lattice is used by many packages because it is highly extensible:
New high-level functions can be created:
Manipulation (extension) of trellis objects:
Example code includes:
# Use 'EuStockMarkets' time series data
data(EuStockMarkets)
str(EuStockMarkets)
## Time-Series [1:1860, 1:4] from 1991 to 1999: 1629 1614 1607 1621 1618 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "DAX" "SMI" "CAC" "FTSE"
# Create time series plot
xyplot(EuStockMarkets,
# Plot all series together
superpose = TRUE,
# Split up the time axis into parts
cut = list(number = 3, overlap = 0.25))
# Create time series plot
xyplot(EuStockMarkets,
# Specify panel function
panel=panel.horizonplot,
# Specify prepanel function
prepanel=prepanel.horizonplot)
# Load required packages
library(maps)
# Create map object for US counties
county.map <- map("county", plot = FALSE, fill = TRUE,
# Specify projection
projection = "sinusoidal")
# Create choropleth map
row.names(USCancerRates) <- rn_USCR
mapplot(row.names(USCancerRates) ~ log10(rate.male) + log10(rate.female),
data = USCancerRates,
xlab = "", scales = list(draw = FALSE),
# Specify map
map = county.map)
## Warning in (function (x, y, map, breaks, colramp, exact = FALSE,
## lwd = 0.5, : 64 unmatched regions: alaska,nome, alaska,wade hampton,
## alaska,haines, alaska,....
## Warning in (function (x, y, map, breaks, colramp, exact = FALSE,
## lwd = 0.5, : 64 unmatched regions: alaska,nome, alaska,wade hampton,
## alaska,haines, alaska,....
# Create subset for Louisiana
LACancerRates1 <- filter(USCancerRates, state == "Louisiana")
str(LACancerRates1)
## 'data.frame': 64 obs. of 11 variables:
## $ rate.male : num 369 361 349 338 338 ...
## $ LCL95.male : num 316 289 302 308 303 ...
## $ UCL95.male : num 428 446 402 372 376 ...
## $ rate.female : num 162 193 215 194 192 ...
## $ LCL95.female : num 134 150 184 176 170 ...
## $ UCL95.female : num 196 246 250 215 218 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ county :Class 'AsIs' chr [1:64] "Richland Parish" "Madison Parish" "De Soto Parish" "St. Bernard Parish" ...
## $ state.ordered : Factor w/ 49 levels "Utah","New Mexico",..: 46 46 46 46 46 46 46 46 46 46 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ division : Factor w/ 9 levels "New England",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ division.ordered: Factor w/ 9 levels "Mountain","West North Central",..: 7 7 7 7 7 7 7 7 7 7 ...
## ..- attr(*, "scores")= num [1:9(1d)] 417 416 446 466 433 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "New England" "Middle Atlantic" "South Atlantic" "East South Central" ...
# Reorder levels of county
LACancerRates2 <-
mutate(LACancerRates1,
county = reorder(county, rate.male))
# Draw confidence intervals
segplot(county ~ LCL95.male + UCL95.male,
data = LACancerRates2,
# Add point estimates
centers = rate.male,
# Draw segments rather than bands
draw.bands = FALSE)
# The 'USCancerRates' dataset
str(USCancerRates)
## 'data.frame': 3041 obs. of 11 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female : num 124 103 161 157 151 ...
## $ UCL95.female : num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
## $ state.ordered : Factor w/ 49 levels "Utah","New Mexico",..: 25 25 25 25 25 25 25 25 25 25 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ division : Factor w/ 9 levels "New England",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ division.ordered: Factor w/ 9 levels "Mountain","West North Central",..: 9 9 9 9 9 9 9 9 9 9 ...
## ..- attr(*, "scores")= num [1:9(1d)] 417 416 446 466 433 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "New England" "Middle Atlantic" "South Atlantic" "East South Central" ...
# Load the 'hexbin' package
library(hexbin)
# Create hexbin plot
hexbinplot(rate.female ~ rate.male,
data = USCancerRates,
# Add a regression line
type = "r",
# function to transform counts
trans = sqrt,
# function to invert transformed counts
inv = function(x) x^2
)
# Load the 'directlabels' package
library(directlabels)
# Use the 'airquality' dataset
str(airquality)
## 'data.frame': 153 obs. of 7 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R : int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Month.Name: Factor w/ 12 levels "January","February",..: 5 5 5 5 5 5 5 5 5 5 ...
# Create factor variable
airquality$Month.Name <-
factor(month.name[airquality$Month], levels = month.name)
# Create density plot object
tplot2 <-
densityplot(~ Ozone + Temp, data = airquality,
# Variables should go in different panels
outer = TRUE,
# Specify grouping variable
groups = Month.Name,
# Suppress display of data points
plot.points = FALSE,
# Add reference line
ref = TRUE,
# Specify layout
layout = c(2, 1),
# Omit strip labels
strip = FALSE,
# Provide column-specific x-axis labels
xlab = c("Ozone (ppb)", "Temperature (F)"),
# Let panels have independent scales
scales = list(relation="free"))
# Produce plot with direct labels
direct.label(tplot2)
# 'USCancerRates' is pre-loaded
str(USCancerRates)
## 'data.frame': 3041 obs. of 11 variables:
## $ rate.male : num 364 346 341 336 330 ...
## $ LCL95.male : num 311 274 304 289 293 ...
## $ UCL95.male : num 423 431 381 389 371 ...
## $ rate.female : num 151 140 182 185 172 ...
## $ LCL95.female : num 124 103 161 157 151 ...
## $ UCL95.female : num 184 190 206 218 195 ...
## $ state : Factor w/ 49 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county :Class 'AsIs' chr [1:3041] "Pickens County" "Bullock County" "Russell County" "Barbour County" ...
## $ state.ordered : Factor w/ 49 levels "Utah","New Mexico",..: 25 25 25 25 25 25 25 25 25 25 ...
## ..- attr(*, "scores")= num [1:49(1d)] 166 166 145 169 159 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ division : Factor w/ 9 levels "New England",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ division.ordered: Factor w/ 9 levels "Mountain","West North Central",..: 9 9 9 9 9 9 9 9 9 9 ...
## ..- attr(*, "scores")= num [1:9(1d)] 417 416 446 466 433 ...
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr "New England" "Middle Atlantic" "South Atlantic" "East South Central" ...
# Create scatter plot
p <- xyplot(rate.female ~ rate.male, data = USCancerRates,
# Change plotting character
pch = 16,
# Make points semi-transparent
alpha = 0.25)
# Create layer with reference grid
l0 <- layer_(panel.grid())
# Create layer with reference line
l1 <- layer(panel.abline(0, 1))
# Create layer with regression fit
l2 <- layer(panel.smoother(x, y, method="lm"))
# Combine and plot
p + l0 + l1 + l2
Chapter 1 - R Time Series Visualization Tools
Refresher on xts and the plot() function:
Other useful visualizing functions:
Example code includes:
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
# data is a 504x4 xts object of Yahoo, Microsoft, Citigroup, and Dow
tmpData <- readr::read_delim("./RInputFiles/dataset_1_1.csv", delim=" ")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## yahoo = col_double(),
## microsoft = col_double(),
## citigroup = col_double(),
## dow_chemical = col_double()
## )
data <- xts::xts(tmpData[, -1], order.by=as.POSIXct(tmpData$Index))
# Display the first few lines of the data
head(data)
## yahoo microsoft citigroup dow_chemical
## 2015-01-01 18:00:00 50.17 44.30501 53.45259 42.48209
## 2015-01-04 18:00:00 49.13 43.89759 51.76803 41.16821
## 2015-01-05 18:00:00 49.21 43.25329 49.94556 40.50662
## 2015-01-06 18:00:00 48.59 43.80284 50.40857 40.44139
## 2015-01-07 18:00:00 50.23 45.09144 51.16711 41.44776
## 2015-01-08 18:00:00 49.72 44.71244 50.02437 41.38253
# Display the column names of the data
colnames(data)
## [1] "yahoo" "microsoft" "citigroup" "dow_chemical"
# Plot yahoo data and add title
plot(data[, "yahoo"], main="yahoo")
# Replot yahoo data with labels for X and Y axes
plot(data[, "yahoo"], main="yahoo", xlab="date", ylab="price")
# Note that type="h" is for bars
# Plot the second time series and change title
plot(data[, 2], main="microsoft")
# Replot with same title, add subtitle, use bars
plot(data[, 2], main="microsoft", sub="Daily closing price since 2015", type="h")
# Change line color to red
lines(data[, 2], col="red")
# Plot two charts on same graphical window
par(mfrow = c(2, 1))
plot(data[, 1], main="yahoo")
plot(data[, 2], main="microsoft")
# Replot with reduced margin and character sizes
par(mfrow = c(2, 1), mex=0.6, cex=0.8)
plot(data[, 1], main="yahoo")
plot(data[, 2], main="microsoft")
par(mfrow = c(1, 1), mex=1, cex=1)
# Plot the "microsoft" series
plot(data[, "microsoft"], main="Stock prices since 2015")
# Add the "dow_chemical" series in red
lines(data[, "dow_chemical"], col="red")
# Add a Y axis on the right side of the chart
axis(side=4, at=pretty(data[, "dow_chemical"]))
# Add a legend in the bottom right corner
legend("bottomright", legend=c("microsoft", "dow_chemical"), col=c("black", "red"), lty=c(1, 1))
# Plot the "citigroup" time series
plot(data[, "citigroup"], main="Citigroup")
# Create vert_line to identify January 4th, 2016 in citigroup
vert_line <- which(index(data[, "citigroup"]) == as.POSIXct("2016-01-04"))
# Add a red vertical line using vert_line
abline(v = .index(data[, "citigroup"])[vert_line], col = "red")
# Create hori_line to identify average price of citigroup
hori_line <- mean(data[, "citigroup"])
# Add a blue horizontal line using hori_line
abline(h = hori_line, col = "blue")
# Create period to hold the 3 months of 2015
period <- c("2015-01/2015-03")
# Highlight the first three months of 2015
PerformanceAnalytics::chart.TimeSeries(data[, "citigroup"], period.areas=period)
# Highlight the first three months of 2015 in light grey
PerformanceAnalytics::chart.TimeSeries(data[, "citigroup"], period.areas=period, period.color="lightgrey")
# Plot the microsoft series
plot(data[, "microsoft"], main="Dividend date and amount")
# Add the citigroup series
lines(data[, "citigroup"], col="orange", lwd=2)
# Add a new y axis for the citigroup series
axis(side=4, at=pretty(data[, "citigroup"]), col="orange")
micro_div_date <- "15 Nov. 2016"
citi_div_date <- "13 Nov. 2016"
micro_div_value <- "$0.39"
citi_div_value <- "$0.16"
# Same plot as the previous exercise
plot(data$microsoft, main = "Dividend date and amount")
lines(data$citigroup, col = "orange", lwd = 2)
axis(side = 4, at = pretty(data$citigroup), col = "orange")
# Create the two legend strings
micro <- paste0("Microsoft div. of ", micro_div_value," on ", micro_div_date)
citi <- paste0("Citigroup div. of ", citi_div_value," on ", citi_div_date)
# Create the legend in the bottom right corner
legend(x = "bottomright", legend = c(micro, citi), col = c("black", "orange"), lty = c(1, 1))
data_1_1_old <- data
Chapter 2 - Univariate Time Series
Univariate time series analysis - deals with only a single variable:
Other visualization tools:
Combining everything so far:
Example code includes:
tmpData <- readr::read_delim("./RInputFiles/dataset_2_1.csv", delim=" ")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## Apple = col_double()
## )
names(tmpData) <- c("Index", "apple")
data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot Apple's stock price
plot(data[, "apple"], main="Apple stock price")
# Create a time series called rtn
rtn <- TTR::ROC(data[, "apple"])
# Plot Apple daily price and daily returns
par(mfrow=c(1, 2))
plot(data[, "apple"], main="Apple stock price")
plot(rtn)
par(mfrow=c(1, 1))
dim(rtn)
## [1] 522 1
rtn <- rtn[complete.cases(rtn), ]
dim(rtn)
## [1] 521 1
# Create a histogram of Apple stock returns
hist(rtn, main="Apple stock return distribution", probability=TRUE)
# Add a density line
lines(density(rtn[complete.cases(rtn), ]))
# Redraw a thicker, red density line
lines(density(rtn[complete.cases(rtn), ]), col="red", lwd=2)
rtnRaw <- as.double(rtn$apple)
# Draw box and whisker plot for the Apple returns
boxplot(rtnRaw)
# Draw a box and whisker plot of a normal distribution
boxplot(rnorm(1000))
# Redraw both plots on the same graphical window
par(mfrow=c(2, 1))
boxplot(rtnRaw, horizontal=TRUE)
boxplot(rnorm(1000), horizontal=TRUE)
par(mfrow=c(1, 1))
# Draw autocorrelation plot
acf(rtn, main="Apple return autocorrelation")
# Redraw with a maximum lag of 10
acf(rtn, main="Apple return autocorrelation", lag.max=10)
# Create q-q plot
qqnorm(rtn, main="Apple return QQ-plot")
# Add a red line showing normality
qqline(rtn, col="red")
par(mfrow=c(2, 2))
hist(rtn, probability=TRUE)
lines(density(rtn), col="red")
boxplot(rtnRaw)
acf(rtn)
qqnorm(rtn)
qqline(rtn, col="red")
par(mfrow=c(1, 1))
Chapter 3 - Multivariate Time Series
Dealing with higher dimensions - visualization challenges with larger numbers of series:
Multivariate time series:
Higher dimension time series:
Example code includes:
# You are provided with a dataset (portfolio) containing the weights of stocks A (stocka) and B (stockb) in your portfolio for each month in 2016
stockA <- c(0.1, 0.4, 0.5, 0.5, 0.2, 0.3, 0.7, 0.8, 0.7, 0.2, 0.1, 0.2)
stockB <- c(0.9, 0.6, 0.5, 0.5, 0.8, 0.7, 0.3, 0.2, 0.3, 0.8, 0.9, 0.8)
pDates <- as.Date(c('2016-01-01', '2016-02-01', '2016-03-01', '2016-04-01', '2016-05-01', '2016-06-01', '2016-07-01', '2016-08-01', '2016-09-01', '2016-10-01', '2016-11-01', '2016-12-01'))
portfolio <- xts(data.frame(stocka=stockA, stockb=stockB), order.by=pDates)
# Plot stacked barplot
barplot(portfolio)
# Plot grouped barplot
barplot(portfolio, beside=TRUE)
tmpData <- readr::read_delim("./RInputFiles/data_3_2.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## sp500 = col_double(),
## citigroup = col_double(),
## microsoft = col_double(),
## apple = col_double(),
## dowchemical = col_double(),
## yahoo = col_double()
## )
# names(tmpData) <- c("Index", "apple")
my_data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
citi <- as.numeric(my_data$citigroup)
sp500 <- as.numeric(my_data$sp500)
# Draw the scatterplot
plot(y=citi, x=sp500)
# Draw a regression line
abline(reg=lm(citi ~ sp500), col="red", lwd=2)
# my_data contains the returns for the S&P 500 index and 5 stocks: Citigroup, Microsoft, Apple, Dow Chemical and Yahoo
# Create correlation matrix using Pearson method
cor(my_data)
## sp500 citigroup microsoft apple dowchemical yahoo
## sp500 1.0000000 0.5097953 0.3743215 0.3576966 0.5217243 0.2900962
## citigroup 0.5097953 1.0000000 0.4841408 0.4291841 0.5085190 0.4029490
## microsoft 0.3743215 0.4841408 1.0000000 0.5133469 0.3954523 0.4329388
## apple 0.3576966 0.4291841 0.5133469 1.0000000 0.3627755 0.3413626
## dowchemical 0.5217243 0.5085190 0.3954523 0.3627755 1.0000000 0.2938749
## yahoo 0.2900962 0.4029490 0.4329388 0.3413626 0.2938749 1.0000000
# Create correlation matrix using Spearman method
cor(my_data, method="spearman")
## sp500 citigroup microsoft apple dowchemical yahoo
## sp500 1.0000000 0.5192579 0.4244237 0.3518853 0.5316235 0.3262037
## citigroup 0.5192579 1.0000000 0.4976477 0.4374850 0.5607511 0.3780730
## microsoft 0.4244237 0.4976477 1.0000000 0.5128477 0.4684114 0.4448179
## apple 0.3518853 0.4374850 0.5128477 1.0000000 0.3681791 0.3680715
## dowchemical 0.5316235 0.5607511 0.4684114 0.3681791 1.0000000 0.3464743
## yahoo 0.3262037 0.3780730 0.4448179 0.3680715 0.3464743 1.0000000
# Create scatterplot matrix
pairs(as.data.frame(my_data))
# Create upper panel scatterplot matrix
pairs(as.data.frame(my_data), lower.panel=NULL)
cor_mat <- cor(my_data)
# In this exercise, you will use the provided correlation matrix cor_mat
# Create correlation matrix
corrplot::corrplot(cor_mat)
# Create correlation matrix with numbers
corrplot::corrplot(cor_mat, method="number")
# Create correlation matrix with colors
corrplot::corrplot(cor_mat, method="color")
# Create upper triangle correlation matrix
corrplot::corrplot(cor_mat, method="number", type="upper")
# Draw heatmap of cor_mat
corrplot::corrplot(cor_mat, method="color")
# Draw upper heatmap
corrplot::corrplot(cor_mat, method="color", type="upper")
# Draw the upper heatmap with hclust
corrplot::corrplot(cor_mat, method="color", type="upper", order="hclust")
Chapter 4 - Case Study: Stock Picking for Portfolios
Case study presentation:
New stocks:
Course conclusion:
Example code includes:
# In this exercise, you are provided with a dataset data containing the value and the return of the portfolio over time, in value and return, respectively.
tmpData <- readr::read_delim("./RInputFiles/data_4_1.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## value = col_double(),
## return = col_double()
## )
# names(tmpData) <- c("Index", "apple")
data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot the portfolio value
plot(data$value, main="Portfolio Value")
# Plot the portfolio return
plot(data$return, main="Portfolio Return")
# Plot a histogram of portfolio return
hist(data$return, probability=TRUE)
# Add a density line
lines(density(data$return), col="red", lwd=2)
tmpPortfolioData <- data
# The new dataset data containing four new stocks is available in your workspace: Goldman Sachs (GS), Coca-Cola (KO), Walt Disney (DIS), Caterpillar (CAT)
tmpData <- readr::read_delim("./RInputFiles/data_4_3.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## GS = col_double(),
## KO = col_double(),
## DIS = col_double(),
## CAT = col_double()
## )
# names(tmpData) <- c("Index", "apple")
data <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot the four stocks on the same graphical window
par(mfrow=c(2, 2), mex=0.8, cex=0.8)
plot(data[, 1])
plot(data[, 2])
plot(data[, 3])
plot(data[, 4])
par(mfrow=c(1, 1), mex=1, cex=1)
# In this exercise, you are provided with four individual series containing the return of the same four stocks:
# gs, ko, dis, cat
# Solution makes absolutely no sense
# NOTE(review): `cat` below shadows base::cat() in the global environment;
# a different name (e.g. cat_rtn) would avoid confusion in later code.
# NOTE(review): the lengths of `portfolio` and the ROC return series are
# not checked here -- plot()/lm() will recycle or error if they differ;
# confirm both data sets cover the same dates.
portfolio <- as.numeric(tmpPortfolioData$return)
gs <- as.numeric(TTR::ROC(data[, "GS"]))[-1]
ko <- as.numeric(TTR::ROC(data[, "KO"]))[-1]
dis <- as.numeric(TTR::ROC(data[, "DIS"]))[-1]
cat <- as.numeric(TTR::ROC(data[, "CAT"]))[-1]
# Draw the scatterplot of gs against the portfolio
plot(y=portfolio, x=gs)
# Add a regression line in red
# NOTE(review): lm(gs ~ portfolio) regresses gs on portfolio, but the plot
# has x=gs and y=portfolio, so the line's orientation does not match the
# axes; lm(portfolio ~ gs) would match the plotted orientation. Kept as-is
# because it mirrors the course solution (see "makes no sense" note above).
abline(reg=lm(gs ~ portfolio), col="red", lwd=2)
# Plot scatterplots and regression lines to a 2x2 window
par(mfrow=c(2, 2))
plot(y=portfolio, x=gs)
abline(reg=lm(gs ~ portfolio), col="red", lwd=2)
plot(y=portfolio, x=ko)
abline(reg=lm(ko ~ portfolio), col="red", lwd=2)
plot(y=portfolio, x=dis)
abline(reg=lm(dis ~ portfolio), col="red", lwd=2)
plot(y=portfolio, x=cat)
abline(reg=lm(cat ~ portfolio), col="red", lwd=2)
# Restore single-plot layout
par(mfrow=c(1, 1))
# In this exercise, you are given a dataset old.vs.new.portfolio with the following self-explanatory columns: old.portfolio.value, new.portfolio.value, old.portfolio.rtn, new.portfolio.rtn
tmpData <- readr::read_delim("./RInputFiles/old.vs.new.portfolio.csv", delim=",")
## Parsed with column specification:
## cols(
## Index = col_date(format = ""),
## old.portfolio.value = col_double(),
## new.portfolio.value = col_double(),
## old.portfolio.rtn = col_double(),
## new.portfolio.rtn = col_double()
## )
# names(tmpData) <- c("Index", "apple")
old.vs.new.portfolio <- xts::xts(tmpData[, -1], order.by=as.Date(tmpData$Index))
# indexClass(data) <- c("POSIXt", "POSIXlt")
# Plot new and old portfolio values on same chart
plot(old.vs.new.portfolio$old.portfolio.value)
lines(old.vs.new.portfolio$new.portfolio.value, col = "red")
# Plot density of the new and old portfolio returns on same chart
plot(density(old.vs.new.portfolio$old.portfolio.rtn))
lines(density(old.vs.new.portfolio$new.portfolio.rtn), col ="red")
# Draw value, return, drawdowns of old portfolio
PerformanceAnalytics::charts.PerformanceSummary(old.vs.new.portfolio[, "old.portfolio.rtn"])
# Draw value, return, drawdowns of new portfolio
PerformanceAnalytics::charts.PerformanceSummary(old.vs.new.portfolio[, "new.portfolio.rtn"])
# Draw both portfolios on same chart
# Draw value, return, drawdowns of new portfolio
PerformanceAnalytics::charts.PerformanceSummary(old.vs.new.portfolio[, c("old.portfolio.rtn", "new.portfolio.rtn")])
Chapter 1 - Custom ggplot2 themes
Introduction to the data - finding stories in datasets:
Filtering and plotting the data:
Custom ggplot2 themes - providing a custom look to a chart:
Example code includes:
library(ggplot2)
load("./RInputFiles/ilo_hourly_compensation.RData")
load("./RInputFiles/ilo_working_hours.RData")
# Join both data frames
ilo_data <- ilo_hourly_compensation %>%
inner_join(ilo_working_hours, by = c("country", "year"))
# Count the resulting rows
ilo_data %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 612
# Examine ilo_data
ilo_data
## # A tibble: 612 x 4
## country year hourly_compensation working_hours
## <chr> <chr> <dbl> <dbl>
## 1 Australia 1980.0 8.44 34.6
## 2 Canada 1980.0 8.87 34.8
## 3 Denmark 1980.0 10.8 31.9
## 4 Finland 1980.0 8.61 35.6
## 5 France 1980.0 8.90 35.4
## 6 Italy 1980.0 8.09 35.7
## 7 Japan 1980.0 5.46 40.8
## 8 Korea, Rep. 1980.0 0.950 55.3
## 9 Norway 1980.0 11.8 30.4
## 10 Spain 1980.0 5.86 36.8
## # ... with 602 more rows
# Turn year into a factor
ilo_data <- ilo_data %>%
mutate(year = as.factor(as.numeric(year)))
# Turn country into a factor
ilo_data <- ilo_data %>%
mutate(country = as.factor(country))
# Examine the European countries vector
european_countries <- c('Finland', 'France', 'Italy', 'Norway', 'Spain', 'Sweden', 'Switzerland', 'United Kingdom', 'Belgium', 'Ireland', 'Luxembourg', 'Portugal', 'Netherlands', 'Germany', 'Hungary', 'Austria', 'Czech Rep.')
european_countries
## [1] "Finland" "France" "Italy" "Norway"
## [5] "Spain" "Sweden" "Switzerland" "United Kingdom"
## [9] "Belgium" "Ireland" "Luxembourg" "Portugal"
## [13] "Netherlands" "Germany" "Hungary" "Austria"
## [17] "Czech Rep."
# Only retain European countries
ilo_data <- ilo_data %>%
filter(country %in% european_countries)
# Examine the structure of ilo_data
str(ilo_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 380 obs. of 4 variables:
## $ country : Factor w/ 30 levels "Australia","Austria",..: 8 9 15 22 25 27 28 29 8 9 ...
## $ year : Factor w/ 27 levels "1980","1981",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ hourly_compensation: num 8.61 8.9 8.09 11.8 5.86 ...
## $ working_hours : num 35.6 35.4 35.7 30.4 36.8 ...
# Group and summarize the data
ilo_data %>%
group_by(year) %>%
summarize(mean_hourly_compensation = mean(hourly_compensation),
mean_working_hours = mean(working_hours))
## # A tibble: 27 x 3
## year mean_hourly_compensation mean_working_hours
## <fct> <dbl> <dbl>
## 1 1980 9.27 34.0
## 2 1981 8.69 33.6
## 3 1982 8.36 33.5
## 4 1983 7.81 33.9
## 5 1984 7.54 33.7
## 6 1985 7.79 33.7
## 7 1986 9.70 34.0
## 8 1987 12.1 33.6
## 9 1988 13.2 33.7
## 10 1989 13.1 33.5
## # ... with 17 more rows
# Filter for 2006
plot_data <- ilo_data %>%
filter(year == 2006)
# Create the scatter plot
ggplot(plot_data) +
geom_point(aes(x = working_hours, y = hourly_compensation))
# Create the plot
ggplot(plot_data) +
geom_point(aes(x = working_hours, y = hourly_compensation)) +
# Add labels
labs(
x = "Working hours per week",
y = "Hourly compensation",
title = "The more people work, the less compensation they seem to receive",
subtitle = "Working hours and hourly compensation in European countries, 2006",
caption = "Data source: ILO, 2017"
)
# Save your current plot into a variable: ilo_plot
ilo_plot <- ggplot(plot_data) +
geom_point(aes(x = working_hours, y = hourly_compensation)) +
labs(
x = "Working hours per week",
y = "Hourly compensation",
title = "The more people work, the less compensation they seem to receive",
subtitle = "Working hours and hourly compensation in European countries, 2006",
caption = "Data source: ILO, 2017"
)
# Try out theme_minimal
ilo_plot +
theme_minimal()
# Try out any other possible theme function
ilo_plot +
theme_linedraw()
windowsFonts(Bookman=windowsFont("Bookman Old Style"))
ilo_plot <- ilo_plot +
theme_minimal() +
# Customize the "minimal" theme with another custom "theme" call
theme(
text = element_text(family = "Bookman"),
title = element_text(color = "gray25"),
plot.subtitle = element_text(size=12),
plot.caption = element_text(color = "gray30")
)
# Render the plot object
ilo_plot
ilo_plot +
# "theme" calls can be stacked upon each other, so this is already the third call of "theme"
theme(
plot.background = element_rect(fill = "gray95"),
plot.margin = unit(c(5, 10, 5, 10), units = "mm")
)
Chapter 2 - Creating Custom and Unique Visualization
Visualizing aspects of data with facets:
Custom plot to emphasize change:
Polishing the dot plot:
Finalizing plots for different audiences and devices:
Example code includes:
# Filter ilo_data to retain the years 1996 and 2006
# NOTE(review): `year` is a factor here, so `year == 1996` compares the
# factor's label against "1996" -- this works, but filtering on explicit
# level strings would be clearer.
ilo_data <- ilo_data %>%
filter(year == 1996 | year == 2006)
# Again, you save the plot object into a variable so you can save typing later on
# NOTE(review): the subtitle still says "2006" although the faceted plot
# now covers both 1996 and 2006 -- confirm whether it should be updated.
ilo_plot <- ggplot(ilo_data, aes(x = working_hours, y = hourly_compensation)) +
geom_point() +
labs(
x = "Working hours per week",
y = "Hourly compensation",
title = "The more people work, the less compensation they seem to receive",
subtitle = "Working hours and hourly compensation in European countries, 2006",
caption = "Data source: ILO, 2017"
) +
# Add facets here
facet_grid(facets = . ~ year)
# Render the faceted plot
ilo_plot
# For a starter, let's look at what you did before: adding various theme calls to your plot object
ilo_plot +
theme_minimal() +
theme(
text = element_text(family = "Bookman", color = "gray25"),
plot.subtitle = element_text(size = 12),
plot.caption = element_text(color = "gray30"),
plot.background = element_rect(fill = "gray95"),
plot.margin = unit(c(5, 10, 5, 10), units = "mm")
)
# Define your own theme function below
# Custom ggplot2 theme: "minimal" base plus ILO-report styling
# (Bookman typeface, gray palette, wider margins).
# Returns a theme object that can be added to any ggplot with `+`.
theme_ilo <- function() {
  ilo_overrides <- theme(
    plot.margin     = unit(c(5, 10, 5, 10), units = "mm"),
    plot.background = element_rect(fill = "gray95"),
    plot.caption    = element_text(color = "gray30"),
    plot.subtitle   = element_text(size = 12),
    text            = element_text(family = "Bookman", color = "gray25")
  )
  theme_minimal() + ilo_overrides
}
# Apply your theme function
ilo_plot <- ilo_plot + theme_ilo()
# Examine ilo_plot
ilo_plot
ilo_plot +
# Add another theme call
theme(
# Change the background fill to make it a bit darker
strip.background = element_rect(fill = "gray60", color = "gray95"),
# Make text a bit bigger and change its color to white
strip.text = element_text(size = 11, color = "white")
)
# Create the dot plot
ggplot(ilo_data) +
geom_path(aes(x=working_hours, y=country))
ggplot(ilo_data) +
geom_path(aes(x = working_hours, y = country),
# Add an arrow to each path
arrow = arrow(length = unit(1.5, "mm"), type = "closed"))
ggplot(ilo_data) +
geom_path(aes(x = working_hours, y = country),
arrow = arrow(length = unit(1.5, "mm"), type = "closed")) +
# Add a geom_text() geometry
geom_text(
aes(x = working_hours,
y = country,
label = round(working_hours, 1))
)
library(forcats)
# Reorder country factor levels
ilo_data <- ilo_data %>%
# Arrange data frame
arrange(country, year) %>%
# Reorder countries by working hours in 2006
mutate(country = fct_reorder(country,
working_hours,
last))
# Plot again
ggplot(ilo_data) +
geom_path(aes(x = working_hours, y = country),
arrow = arrow(length = unit(1.5, "mm"), type = "closed")) +
geom_text(
aes(x = working_hours,
y = country,
label = round(working_hours, 1))
)
# Save plot into an object for reuse
ilo_dot_plot <- ggplot(ilo_data) +
geom_path(aes(x = working_hours, y = country),
arrow = arrow(length = unit(1.5, "mm"), type = "closed")) +
# Specify the hjust aesthetic with a conditional value
geom_text(
aes(x = working_hours,
y = country,
label = round(working_hours, 1),
hjust = ifelse(year == "2006", 1.4, -0.4)
),
# Change the appearance of the text
size = 3,
family = "Bookman",
color = "gray25"
)
ilo_dot_plot
# Reuse ilo_dot_plot
ilo_dot_plot <- ilo_dot_plot +
# Add labels to the plot
labs(
x = "Working hours per week",
y = "Country",
title = "People work less in 2006 compared to 1996",
subtitle = "Working hours in European countries, development since 1996",
caption = "Data source: ILO, 2017"
) +
# Apply your theme
theme_ilo() +
# Change the viewport
coord_cartesian(xlim = c(25, 41))
# View the plot
ilo_dot_plot
# Compute temporary data set for optimal label placement
median_working_hours <- ilo_data %>%
group_by(country) %>%
summarize(median_working_hours_per_country = median(working_hours)) %>%
ungroup()
# Have a look at the structure of this data set
str(median_working_hours)
## Classes 'tbl_df', 'tbl' and 'data.frame': 17 obs. of 2 variables:
## $ country : Factor w/ 30 levels "Netherlands",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ median_working_hours_per_country: num 27 27.8 28.4 31 30.9 ...
ilo_dot_plot +
# Add label for country
geom_text(data = median_working_hours,
aes(y = country,
x = median_working_hours_per_country,
label = country),
vjust = -0.5,
size=3,
family = "Bookman",
color = "gray25") +
# Remove axes and grids
theme(
axis.ticks = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
panel.grid = element_blank(),
# Also, let's reduce the font size of the subtitle
plot.subtitle = element_text(size = 9)
)
Chapter 3 - Introduction to R Markdown
What is R Markdown?
Formatting with R Markdown:
R Code in R Markdown Documents:
Images in R Markdown Files:
Example code is contained in the summary Excel worksheet.
Chapter 4 - Customizing R Markdown Reports
Advanced YAML Settings (YAML is a recursive acronym meaning "YAML Ain't Markup Language"):
Custom stylesheets - creating a unique theme for a report:
Beautiful tables:
Summary:
Example code is contained in the summary Excel worksheet.
Chapter 1 - Binomial Distribution
Flipping coins in R - for example, rbinom(1, 1, 0.5) - 1 draw of 1 coin with a 50% chance of being heads:
Density and cumulative density:
Expected value and variance:
Example code includes:
# Generate 10 separate random flips with probability .3
rbinom(10, 1, 0.3)
## [1] 0 1 0 0 1 1 0 0 1 0
# Generate 100 occurrences of flipping 10 coins, each with 30% probability
rbinom(100, 10, 0.3)
## [1] 2 1 6 4 2 3 3 6 8 5 1 1 3 7 1 5 4 6 4 3 4 2 4 2 4 1 2 5 1 7 2 5 2 5 3
## [36] 4 5 2 3 3 0 4 3 3 5 2 4 1 2 3 2 1 4 5 4 0 5 6 5 2 1 2 3 2 2 4 2 5 3 5
## [71] 3 4 1 2 4 1 3 2 6 3 4 2 4 6 6 2 2 2 4 6 4 4 2 1 4 3 0 4 3 3
# Calculate the probability that 2 are heads using dbinom
dbinom(2, 10, 0.3)
## [1] 0.2334744
# Confirm your answer with a simulation using rbinom
mean(rbinom(10000, 10, 0.3) == 2)
## [1] 0.2353
# Calculate the probability that at least five coins are heads
1 - pbinom(4, 10, 0.3)
## [1] 0.1502683
# Confirm your answer with a simulation of 10,000 trials
mean(rbinom(10000, 10, 0.3) >= 5)
## [1] 0.1533
# Here is how you computed the answer in the last problem
mean(rbinom(10000, 10, .3) >= 5)
## [1] 0.149
# Try now with 100, 1000, 10,000, and 100,000 trials
mean(rbinom(100, 10, .3) >= 5)
## [1] 0.16
mean(rbinom(1000, 10, .3) >= 5)
## [1] 0.158
mean(rbinom(10000, 10, .3) >= 5)
## [1] 0.1518
mean(rbinom(100000, 10, .3) >= 5)
## [1] 0.15187
# Calculate the expected value using the exact formula
25 * 0.3
## [1] 7.5
# Confirm with a simulation using rbinom
mean(rbinom(10000, 25, 0.3))
## [1] 7.4447
# Calculate the variance using the exact formula
25 * 0.3 * (1 - 0.3)
## [1] 5.25
# Confirm with a simulation using rbinom
var(rbinom(10000, 25, 0.3))
## [1] 5.15845
Chapter 2 - Laws of Probability
Probability of Event A and Event B:
Probability of A or B:
Multiplying random variables:
Adding random variables:
Example code includes:
# Simulate 100,000 flips of a coin with a 40% chance of heads
A <- rbinom(100000, 1, 0.4)
# Simulate 100,000 flips of a coin with a 20% chance of heads
B <- rbinom(100000, 1, 0.2)
# Estimate the probability both A and B are heads
mean(A & B)
## [1] 0.0805
# You've already simulated 100,000 flips of coins A and B
A <- rbinom(100000, 1, .4)
B <- rbinom(100000, 1, .2)
# Simulate 100,000 flips of coin C (70% chance of heads)
C <- rbinom(100000, 1, .7)
# Estimate the probability A, B, and C are all heads
mean(A & B & C)
## [1] 0.05589
# Simulate 100,000 flips of a coin with a 60% chance of heads
A <- rbinom(100000, 1, 0.6)
# Simulate 100,000 flips of a coin with a 10% chance of heads
B <- rbinom(100000, 1, 0.1)
# Estimate the probability either A or B is heads
mean(A | B)
## [1] 0.63736
# Use rbinom to simulate 100,000 draws from each of X and Y
X <- rbinom(100000, 10, 0.6)
Y <- rbinom(100000, 10, 0.7)
# Estimate the probability either X or Y is <= to 4
mean((X <= 4) | (Y <= 4))
## [1] 0.20613
# Use pbinom to calculate the probabilities separately
prob_X_less <- pbinom(4, 10, 0.6)
prob_Y_less <- pbinom(4, 10, 0.7)
# Combine these to calculate the exact probability either <= 4
prob_X_less + prob_Y_less - prob_X_less * prob_Y_less
## [1] 0.2057164
# Simulate 100,000 draws of a binomial with size 20 and p = .1
X <- rbinom(100000, 20, 0.1)
# Estimate the expected value of X
mean(X)
## [1] 1.9991
# Estimate the expected value of 5 * X
mean(5 * X)
## [1] 9.9955
# Estimate the variance of X
var(X)
## [1] 1.786197
# Estimate the variance of 5 * X
var(5 * X)
## [1] 44.65493
# Simulate 100,000 draws of X (size 20, p = .3) and Y (size 40, p = .1)
X <- rbinom(100000, 20, 0.3)
Y <- rbinom(100000, 40, 0.1)
# Estimate the expected value of X + Y
mean(X + Y)
## [1] 9.99048
# Find the variance of X + Y
var(X + Y)
## [1] 7.798627
# Find the variance of 3 * X + Y
var(3 * X + Y)
## [1] 41.20331
Chapter 3 - Bayesian Statistics
Updating with evidence:
Prior probability - may not be equal odds prior to seeing any evidence:
Bayes theorem:
Example code includes:
# Simulate 50000 cases of flipping 20 coins from fair and from biased
fair <- rbinom(50000, 20, 0.5)
biased <- rbinom(50000, 20, 0.75)
# How many fair cases, and how many biased, led to exactly 11 heads?
fair_11 <- sum(fair == 11)
biased_11 <- sum(biased == 11)
# Find the fraction of fair coins that are 11 out of all coins that were 11
fair_11 / (fair_11 + biased_11)
## [1] 0.8487457
# How many fair cases, and how many biased, led to exactly 16 heads?
fair_16 <- sum(fair == 16)
biased_16 <- sum(biased == 16)
# Find the fraction of fair coins that are 16 out of all coins that were 16
fair_16 / (fair_16 + biased_16)
## [1] 0.02418033
# Simulate 8000 cases of flipping a fair coin, and 2000 of a biased coin
fair_flips <- rbinom(8000, 20, 0.5)
biased_flips <- rbinom(2000, 20, 0.75)
# Find the number of cases from each coin that resulted in 14/20
fair_14 <- sum(fair_flips == 14)
biased_14 <- sum(biased_flips == 14)
# Use these to estimate the posterior probability
fair_14 / (fair_14 + biased_14)
## [1] 0.4651515
# Simulate 80,000 draws from fair coin, 10,000 from each of high and low coins
flips_fair <- rbinom(80000, 20, 0.5)
flips_high <- rbinom(10000, 20, 0.75)
flips_low <- rbinom(10000, 20, 0.25)
# Compute the number of coins that resulted in 14 heads from each of these piles
fair_14 <- sum(flips_fair == 14)
high_14 <- sum(flips_high == 14)
low_14 <- sum(flips_low == 14)
# Compute the posterior probability that the coin was fair
fair_14 / (fair_14 + high_14 + low_14)
## [1] 0.6370197
# Use dbinom to calculate the probability of 11/20 heads with fair or biased coin
probability_fair <- dbinom(11, 20, 0.5)
probability_biased <- dbinom(11, 20, 0.75)
# Calculate the posterior probability that the coin is fair
probability_fair / (probability_fair + probability_biased)
## [1] 0.8554755
# Find the probability that a coin resulting in 14/20 is fair
probability_fair <- dbinom(14, 20, .5)
probability_biased <- dbinom(14, 20, .75)
probability_fair / (probability_fair + probability_biased)
## [1] 0.179811
# Find the probability that a coin resulting in 18/20 is fair
probability_fair <- dbinom(18, 20, .5)
probability_biased <- dbinom(18, 20, .75)
probability_fair / (probability_fair + probability_biased)
## [1] 0.002699252
# Use dbinom to find the probability of 16/20 from a fair or biased coin
probability_16_fair <- dbinom(16, 20, 0.5)
probability_16_biased <- dbinom(16, 20, 0.75)
# Use Bayes' theorem to find the posterior probability that the coin is fair
(probability_16_fair * 0.99) / (probability_16_fair * 0.99 + probability_16_biased * 0.01)
## [1] 0.7068775
Chapter 4 - Related Distributions
Normal distribution - symmetrical bell curve, Gaussian:
Poisson distribution - approximates the binomial under the assumption of a large number of trials each with a low probability:
Geometric distribution - example of flipping a coin with probability p and assessing when the first success occurs:
Example code includes:
# Draw two stacked (faceted) histograms so the distributions of two
# numeric vectors can be compared visually. Returns a ggplot object.
compare_histograms <- function(variable1, variable2) {
  combined <- rbind(
    data.frame(value = variable1, variable = "Variable 1"),
    data.frame(value = variable2, variable = "Variable 2")
  )
  ggplot(combined, aes(value)) +
    geom_histogram() +
    facet_wrap(~ variable, nrow = 2)
}
# Draw a random sample of 100,000 from the Binomial(1000, .2) distribution
binom_sample <- rbinom(100000, 1000, 0.2)
# Draw a random sample of 100,000 from the normal approximation
normal_sample <- rnorm(100000, 200, sqrt(160))
# Compare the two distributions with the compare_histograms function
compare_histograms(binom_sample, normal_sample)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Use binom_sample to estimate the probability of <= 190 heads
mean(binom_sample <= 190)
## [1] 0.2292
# Use normal_sample to estimate the probability of <= 190 heads
mean(normal_sample <= 190)
## [1] 0.2152
# Calculate the probability of <= 190 heads with pbinom
pbinom(190, 1000, 0.2)
## [1] 0.2273564
# Calculate the probability of <= 190 heads with pnorm
pnorm(190, 200, sqrt(160))
## [1] 0.2145977
# Draw a random sample of 100,000 from the Binomial(10, .2) distribution
binom_sample <- rbinom(100000, 10, 0.2)
# Draw a random sample of 100,000 from the normal approximation
normal_sample <- rnorm(100000, 2, sqrt(1.6))
# Compare the two distributions with the compare_histograms function
compare_histograms(binom_sample, normal_sample)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Draw a random sample of 100,000 from the Binomial(1000, .002) distribution
binom_sample <- rbinom(100000, 1000, 0.002)
# Draw a random sample of 100,000 from the Poisson approximation
poisson_sample <- rpois(100000, 2)
# Compare the two distributions with the compare_histograms function
compare_histograms(binom_sample, poisson_sample)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Find the percentage of simulated values that are 0
mean(poisson_sample == 0)
## [1] 0.13513
# Use dpois to find the exact probability that a draw is 0
dpois(0, 2)
## [1] 0.1353353
# Simulate 100,000 draws from Poisson(1)
X <- rpois(100000, 1)
# Simulate 100,000 draws from Poisson(2)
Y <- rpois(100000, 2)
# Add X and Y together to create Z
Z <- X + Y
# Use compare_histograms to compare Z to the Poisson(3)
compare_histograms(Z, rpois(100000, 3))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Simulate 100 instances of flipping a 20% coin
flips <- rbinom(100, 1, 0.2)
# Use which to find the first case of 1 ("heads")
which(flips == 1)[1]
## [1] 6
# Existing code for finding the first instance of heads
which(rbinom(100, 1, .2) == 1)[1]
## [1] 5
# Replicate this 100,000 times using replicate()
replications <- replicate(100000, which(rbinom(100, 1, .2) == 1)[1])
# Histogram the replications with qplot
qplot(replications)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Generate 100,000 draws from the corresponding geometric distribution
geom_sample <- rgeom(100000, 0.2)
# Compare the two distributions with compare_histograms
compare_histograms(replications, geom_sample)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Find the probability the machine breaks on 5th day or earlier
pgeom(4, 0.1)
## [1] 0.40951
# Find the probability the machine is still working on 20th day
1 - pgeom(19, 0.1)
## [1] 0.1215767
# Calculate the probability of machine working on day 1-30
still_working <- 1 - pgeom(0:29, 0.1)
# Plot the probability for days 1 to 30
qplot(1:30, still_working)
Chapter 1 - Bootstrapping for Parameter Estimates
Introduction - beginning with bootstrapping approach:
Percentile and standard error methods:
Re-centering bootstrap distributions for hypothesis testing:
Example code includes:
manhattan <- readr::read_csv("./RInputFiles/manhattan.csv")
## Parsed with column specification:
## cols(
## rent = col_integer()
## )
# Will need to either call library(infer) or add infer:: to this code
library(infer)
# Generate bootstrap distribution of medians
rent_ci_med <- manhattan %>%
# Specify the variable of interest
specify(response = rent) %>%
# Generate 15000 bootstrap samples
generate(reps = 15000, type = "bootstrap") %>%
# Calculate the median of each bootstrap sample
calculate(stat = "median")
# View the structure of rent_ci_med
str(rent_ci_med)
## Classes 'tbl_df', 'tbl' and 'data.frame': 15000 obs. of 2 variables:
## $ replicate: int 1 2 3 4 5 6 7 8 9 10 ...
## $ stat : num 2422 2350 2262 2325 2350 ...
## - attr(*, "response")= symbol rent
# Plot a histogram of rent_ci_med
ggplot(rent_ci_med, aes(x=stat)) +
geom_histogram(binwidth=50)
# Percentile method
rent_ci_med %>%
summarize(l = quantile(stat, 0.025),
u = quantile(stat, 0.975))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 2162 2875
# Standard error method
# Calculate observed median
rent_med_obs <- manhattan %>%
# Calculate observed median rent
summarize(median(rent)) %>%
# Extract numerical value
pull()
# Determine critical value
# t* for a 95% interval with n - 1 degrees of freedom
t_star <- qt(0.975, df = nrow(manhattan) - 1)
# Construct interval
# NOTE(review): `sd(rent_ci_med$stat)` reaches outside the pipe back into
# the full data frame; `sd(stat)` would use the piped data and is the
# idiomatic form (same result here, since the pipe input is rent_ci_med).
rent_ci_med %>%
summarize(boot_se = sd(rent_ci_med$stat)) %>%
summarize(l = rent_med_obs - t_star * boot_se,
u = rent_med_obs + t_star * boot_se)
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 1994 2706
data(ncbirths, package="openintro")
str(ncbirths)
## 'data.frame': 1000 obs. of 13 variables:
## $ fage : int NA NA 19 21 NA NA 18 17 NA 20 ...
## $ mage : int 13 14 15 15 15 15 15 15 16 16 ...
## $ mature : Factor w/ 2 levels "mature mom","younger mom": 2 2 2 2 2 2 2 2 2 2 ...
## $ weeks : int 39 42 37 41 39 38 37 35 38 37 ...
## $ premie : Factor w/ 2 levels "full term","premie": 1 1 1 1 1 1 1 2 1 1 ...
## $ visits : int 10 15 11 6 9 19 12 5 9 13 ...
## $ marital : Factor w/ 2 levels "married","not married": 1 1 1 1 1 1 1 1 1 1 ...
## $ gained : int 38 20 38 34 27 22 76 15 NA 52 ...
## $ weight : num 7.63 7.88 6.63 8 6.38 5.38 8.44 4.69 8.81 6.94 ...
## $ lowbirthweight: Factor w/ 2 levels "low","not low": 2 2 2 2 2 1 2 1 2 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 2 2 2 1 ...
## $ habit : Factor w/ 2 levels "nonsmoker","smoker": 1 1 1 1 1 1 1 1 1 1 ...
## $ whitemom : Factor w/ 2 levels "not white","white": 1 1 2 2 1 1 1 1 2 2 ...
# Remove NA visits
ncbirths_complete_visits <- ncbirths %>%
filter(!is.na(visits))
# Generate 15000 bootstrap means
visit_ci_mean <- ncbirths_complete_visits %>%
specify(response=visits) %>%
generate(reps=15000, type="bootstrap") %>%
calculate(stat="mean")
# Calculate the 90% CI via percentile method
visit_ci_mean %>%
summarize(l = quantile(stat, 0.05),
u = quantile(stat, 0.95))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 11.9 12.3
# Calculate 15000 bootstrap SDs
visit_ci_sd <- ncbirths_complete_visits %>%
specify(response=visits) %>%
generate(reps=15000, type="bootstrap") %>%
calculate(stat="sd")
# Calculate the 90% CI via percentile method
visit_ci_sd %>%
summarize(l = quantile(stat, 0.05),
u = quantile(stat, 0.95))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 3.74 4.16
# Generate 15000 bootstrap samples centered at null
rent_med_ht <- manhattan %>%
specify(response = rent) %>%
hypothesize(null = "point", med = 2500) %>%
generate(reps = 15000, type = "bootstrap") %>%
calculate(stat = "median")
# Calculate observed median
rent_med_obs <- manhattan %>%
summarize(median(rent)) %>%
pull()
# Calculate p-value
rent_med_ht %>%
filter(stat > rent_med_obs) %>%
summarize(n() / 15000)
## # A tibble: 1 x 1
## `n()/15000`
## <dbl>
## 1 0.948
# Generate 1500 bootstrap means centered at null
weight_mean_ht <- ncbirths %>%
specify(response = weight) %>%
hypothesize(null = "point", mu = 7) %>%
generate(reps=1500, type="bootstrap") %>%
calculate(stat="mean")
# Calculate observed mean
weight_mean_obs <- ncbirths %>%
summarize(mean(weight)) %>%
pull()
# Calculate p-value
weight_mean_ht %>%
filter(stat > weight_mean_obs) %>%
summarize((n()/1500) * 2)
## # A tibble: 1 x 1
## `(n()/1500) * 2`
## <dbl>
## 1 0.0253
Chapter 2 - Introducing the t-distribution
The t-distribution - especially useful when the population standard deviation is unknown (as is typically the case):
Estimating a mean with a t-interval:
The t-interval for paired data:
Testing a mean with a t-test:
Example code includes:
# P(T < 3) for df = 10
(x <- pt(3, df = 10))
## [1] 0.9933282
# P(T > 3) for df = 10
(y <- 1 - pt(3, df=10))
## [1] 0.006671828
# P(T > 3) for df = 100
(z <- 1 - pt(3, df=100))
## [1] 0.001703958
# Comparison
y == z
## [1] FALSE
y > z
## [1] TRUE
y < z
## [1] FALSE
# 95th percentile for df = 10
(x <- qt(0.95, df = 10))
## [1] 1.812461
# upper bound of middle 95th percent for df = 10
(y <- qt(0.975, df = 10))
## [1] 2.228139
# upper bound of middle 95th percent for df = 100
(z <- qt(0.975, df = 100))
## [1] 1.983972
# Comparison
y == z
## [1] FALSE
y > z
## [1] TRUE
y < z
## [1] FALSE
data(acs12, package="openintro")
# Subset for employed respondents
acs12_emp <- acs12 %>%
filter(employment == "employed")
# Construct 95% CI for avg time_to_work
t.test(acs12_emp$time_to_work, conf.level=0.95)
##
## One Sample t-test
##
## data: acs12_emp$time_to_work
## t = 32.635, df = 782, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 24.43369 27.56120
## sample estimates:
## mean of x
## 25.99745
t.test(acs12_emp$hrs_work, conf.level=0.95)
##
## One Sample t-test
##
## data: acs12_emp$hrs_work
## t = 87.521, df = 842, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 38.05811 39.80429
## sample estimates:
## mean of x
## 38.9312
# Paired t-procedures: diff holds a per-book price difference (presumably
# bookstore vs online price -- confirm in the openintro docs), so a
# one-sample t-interval/test on diff is the paired analysis.
data(textbooks, package="openintro")
# 90% CI
t.test(textbooks$diff, conf.level = 0.9)
##
## One Sample t-test
##
## data: textbooks$diff
## t = 7.6488, df = 72, p-value = 6.928e-11
## alternative hypothesis: true mean is not equal to 0
## 90 percent confidence interval:
## 9.981505 15.541783
## sample estimates:
## mean of x
## 12.76164
# 95% CI
t.test(textbooks$diff, conf.level = 0.95)
##
## One Sample t-test
##
## data: textbooks$diff
## t = 7.6488, df = 72, p-value = 6.928e-11
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 9.435636 16.087652
## sample estimates:
## mean of x
## 12.76164
# 99% CI
# Note: raising the confidence level only widens the interval; the point
# estimate (12.76) and the test statistic are unchanged.
t.test(textbooks$diff, conf.level = 0.99)
##
## One Sample t-test
##
## data: textbooks$diff
## t = 7.6488, df = 72, p-value = 6.928e-11
## alternative hypothesis: true mean is not equal to 0
## 99 percent confidence interval:
## 8.347154 17.176133
## sample estimates:
## mean of x
## 12.76164
# Conduct HT
# Two-sided test of H0: true mean difference = 0 -- the same call as above
# with the null value and alternative made explicit (both are defaults)
t.test(textbooks$diff, mu=0, alternative="two.sided", conf.level=0.95)
##
## One Sample t-test
##
## data: textbooks$diff
## t = 7.6488, df = 72, p-value = 6.928e-11
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 9.435636 16.087652
## sample estimates:
## mean of x
## 12.76164
# Calculate 15000 bootstrap medians (infer pipeline; the original comment
# said "means", but the statistic computed below is the median)
textdiff_med_ci <- textbooks %>%
specify(response = diff) %>%
generate(reps=15000, type="bootstrap") %>%
calculate(stat = "median")
# Calculate the 95% CI via percentile method
textdiff_med_ci %>%
summarize(l=quantile(stat, 0.025),
u=quantile(stat, 0.975))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 5.04 11.7
data(hsb2, package="openintro")
# Calculate diff
hsb2 <- hsb2 %>%
mutate(diff = math - science)
# Generate 15000 bootstrap medians centered at the null (H0: median diff = 0)
# NOTE(review): infer's point null for a median is usually given as `med =`,
# not `mu =` -- confirm this against the installed infer version.
scorediff_med_ht <- hsb2 %>%
specify(response=diff) %>%
hypothesize(null="point", mu=0) %>%
generate(reps=15000, type="bootstrap") %>%
calculate(stat="median")
# Calculate observed median of differences
scorediff_med_obs <- hsb2 %>%
summarize(median(diff)) %>%
pull()
# Calculate p-value
# Count one tail, then double it to approximate a two-sided p-value
scorediff_med_ht %>%
filter(stat > scorediff_med_obs) %>%
summarize(p_val = (n() / 15000) * 2)
## # A tibble: 1 x 1
## p_val
## <dbl>
## 1 0.529
Chapter 3 - Inference for Difference in Two Parameters
Hypothesis testing for comparing two means:
Bootstrap CI for difference in two means:
Comparing means with a t-test:
Example code includes:
# Randomization (permutation) test for a difference in mean change between
# the esc and ctrl treatment groups.
data(stem.cell, package="openintro")
str(stem.cell)
## 'data.frame': 18 obs. of 3 variables:
## $ trmt : Factor w/ 2 levels "ctrl","esc": 1 1 1 1 1 1 1 1 1 2 ...
## $ before: num 35.2 36.5 39.8 39.8 41.8 ...
## $ after : num 29.5 29.5 36.2 38 37.5 ...
# Calculate difference between before and after
stem.cell <- stem.cell %>%
mutate(change = after - before)
# Calculate observed difference in means
diff_mean <- stem.cell %>%
# Group by treatment group
group_by(trmt) %>%
# Calculate mean change for each group
summarize(mean_change = mean(change)) %>%
# Extract (pull() grabs the last column, mean_change, in level order
# "ctrl" then "esc")
pull() %>%
# Calculate difference: diff() gives esc mean - ctrl mean
diff()
# Generate 1000 differences in means via randomization
diff_ht_mean <- stem.cell %>%
# y ~ x
specify(change ~ trmt) %>%
# Null = no difference between means
hypothesize(null = "independence") %>%
# Shuffle labels 1000 times
generate(reps = 1000, type = "permute") %>%
# Calculate test statistic; order = c("esc", "ctrl") matches the observed
# esc - ctrl direction from diff() above
calculate(stat = "diff in means", order=rev(levels(stem.cell$trmt)))
# Calculate p-value
diff_ht_mean %>%
# Identify simulated test statistics at least as extreme as observed
filter(stat > diff_mean) %>%
# Calculate p-value (one-sided here: upper tail only, not doubled)
summarize(p_val = (n() / 1000))
## # A tibble: 1 x 1
## p_val
## <dbl>
## 1 0
# Remove subjects with missing habit
ncbirths_complete_habit <- ncbirths %>%
filter(!is.na(habit))
# Calculate observed difference in means
diff_mean <- ncbirths_complete_habit %>%
# Group by habit group
group_by(habit) %>%
# Calculate mean weight for each group
summarize(mean_weight = mean(weight)) %>%
# Extract (pull() grabs the last column, mean_weight)
pull() %>%
# Calculate difference (second habit level minus first)
diff()
# Generate 1000 differences in means via randomization
diff_ht_mean <- ncbirths_complete_habit %>%
# y ~ x
specify(weight ~ habit) %>%
# Null = no difference between means
hypothesize(null = "independence") %>%
# Shuffle labels 1000 times
generate(reps = 1000, type = "permute") %>%
# Calculate test statistic
calculate(stat = "diff in means", order=rev(levels(ncbirths_complete_habit$habit)))
# Calculate p-value
diff_ht_mean %>%
# Identify simulated test statistics at least as extreme as observed
# (lower tail here -- the observed difference is negative, see CI below)
filter(stat < diff_mean) %>%
# Calculate p-value (one tail doubled for a two-sided test)
summarize(p_val = (n()/1000) * 2)
## # A tibble: 1 x 1
## p_val
## <dbl>
## 1 0.0280
# Generate 1500 bootstrap difference in means
diff_mean_ci <- ncbirths_complete_habit %>%
specify(weight ~ habit) %>%
generate(reps = 1500, type = "bootstrap") %>%
calculate(stat = "diff in means", order=rev(levels(ncbirths_complete_habit$habit)))
# Calculate the 95% CI via percentile method
diff_mean_ci %>%
summarize(l=quantile(stat, 0.025),
u=quantile(stat, 0.975))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 -0.583 -0.0530
# Remove subjects with missing habit and weeks
ncbirths_complete_habit_weeks <- ncbirths %>%
filter(!is.na(habit) & !is.na(weeks))
# Generate 1500 bootstrap difference in medians
diff_med_ci <- ncbirths_complete_habit_weeks %>%
specify(weeks ~ habit) %>%
generate(reps = 1500, type = "bootstrap") %>%
calculate(stat="diff in medians", order=rev(levels(ncbirths_complete_habit_weeks$habit)))
# Calculate the 92% CI via percentile method (4% in each tail)
diff_med_ci %>%
summarize(l=quantile(stat, 0.04),
u=quantile(stat, 0.96))
## # A tibble: 1 x 2
## l u
## <dbl> <dbl>
## 1 -1.00 0
# Compare hourly pay between citizens and non-citizens (Welch two-sample
# t-test on the ACS 2012 sample).
# Create hrly_pay and filter for non-missing hrly_pay and citizen
# (hrly_pay assumes income is annual and hrs_work is per week: 52 weeks)
acs12_complete_hrlypay_citizen <- acs12 %>%
mutate(hrly_pay = income / (hrs_work * 52)) %>%
filter(
!is.na(hrly_pay),
!is.na(citizen)
)
# Calculate percent missing
new_n <- nrow(acs12_complete_hrlypay_citizen)
old_n <- nrow(acs12)
(perc_missing <- (old_n - new_n) / old_n)
## [1] 0.5205
# Calculate summary statistics
acs12_complete_hrlypay_citizen %>%
group_by(citizen) %>%
summarize(
x_bar = mean(hrly_pay),
s = sd(hrly_pay),
n = n()
)
## # A tibble: 2 x 4
## citizen x_bar s n
## <fct> <dbl> <dbl> <int>
## 1 no 21.2 34.5 58
## 2 yes 18.5 24.7 901
# Plot the distributions
ggplot(data = acs12_complete_hrlypay_citizen, mapping = aes(x = hrly_pay)) +
geom_histogram(binwidth = 5) +
facet_grid(. ~ citizen, labeller = labeller(citizen = c(no = "Non citizen",
yes = "Citizen")))
# Construct 95% CI and run the two-sided test of H0: difference in means = 0.
# Fixed: t.test() takes the null value via `mu`, not `null`; the original
# `null=0` was silently swallowed by `...` (mu = 0 is also the default, so
# the recorded output is unchanged).
t.test(hrly_pay ~ citizen, data=acs12_complete_hrlypay_citizen, mu=0, alternative="two.sided")
##
## Welch Two Sample t-test
##
## data: hrly_pay by citizen
## t = 0.58058, df = 60.827, p-value = 0.5637
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.53483 11.88170
## sample estimates:
## mean in group no mean in group yes
## 21.19494 18.52151
Chapter 4 - Comparing Many Means
Vocabulary scores vary between social classes:
ANOVA - Analysis of Variance:
Conditions for ANOVA:
Post-hoc testing - determining which of the means are different:
Wrap-up:
Example code includes:
# ANOVA: does mean vocabulary score (wordsum) differ across social classes?
gss <- readr::read_csv("./RInputFiles/gss_wordsum_class.csv")
## Parsed with column specification:
## cols(
## wordsum = col_integer(),
## class = col_character()
## )
str(gss)
## Classes 'tbl_df', 'tbl' and 'data.frame': 795 obs. of 2 variables:
## $ wordsum: int 6 9 6 5 6 6 8 10 8 9 ...
## $ class : chr "MIDDLE" "WORKING" "WORKING" "WORKING" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 2
## .. ..$ wordsum: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ class : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Per-class histograms: eyeball the approximate-normality condition
ggplot(gss, aes(x=wordsum)) +
geom_histogram(binwidth=1) +
facet_grid(class ~ .)
# Fit the one-way ANOVA; tidy() gives the F statistic and p-value as a row
aov_wordsum_class <- aov(wordsum ~ class, data=gss)
broom::tidy(aov_wordsum_class)
## term df sumsq meansq statistic p.value
## 1 class 3 236.5644 78.854810 21.73467 1.560565e-13
## 2 Residuals 791 2869.8003 3.628066 NA NA
# Group standard deviations: check the roughly-equal-variability condition
gss %>%
group_by(class) %>%
summarize(s = sd(wordsum))
## # A tibble: 4 x 2
## class s
## <chr> <dbl>
## 1 LOWER 2.24
## 2 MIDDLE 1.89
## 3 UPPER 2.34
## 4 WORKING 1.87
# Conduct the pairwise.t.test with p.adjust = "none" option (we'll adjust the significance level, not the p-value). The first argument is the response vector and the second argument is the grouping vector.
pairwise.t.test(gss$wordsum, gss$class, p.adjust = "none") %>%
broom::tidy()
## group1 group2 p.value
## 1 MIDDLE LOWER 1.133345e-07
## 2 UPPER LOWER 4.752521e-02
## 3 WORKING LOWER 3.055619e-02
## 5 UPPER MIDDLE 2.395734e-01
## 6 WORKING MIDDLE 1.631637e-12
## 9 WORKING UPPER 3.670775e-01
Chapter 1 - Introduction to Correlation Coefficients
How are correlation coefficients calculated?
Usefulness of correlation coefficients:
Points of caution:
Example code includes:
# Download the Conway lab physical-endurance data once up front; the later
# chunks in this chapter re-use this cached PE object (see the note below)
PE <- read.table("http://assets.datacamp.com/course/Conway/Lab_Data/Stats1.13.Lab.04.txt", header=TRUE)
# Hand-rolled Pearson correlation (covariance over the product of the sample
# standard deviations), verified against cor() at the end.
# Take a quick peek at both vectors
(A <- c(1, 2, 3))
## [1] 1 2 3
(B <- c(3, 6, 7))
## [1] 3 6 7
# Save the differences of each vector element with the mean in a new variable
diff_A <- A - mean(A)
diff_B <- B - mean(B)
# Sum the element-wise products and divide by N-1 to get the sample covariance
# (named cov_AB rather than `cov` so stats::cov() is not masked)
cov_AB <- sum(diff_A * diff_B) / (length(A) - 1)
# Square the differences that were found in the previous step
# (^ is the documented power operator; ** is only an undocumented synonym)
sq_diff_A <- diff_A ^ 2
sq_diff_B <- diff_B ^ 2
# Take the sum of the elements, divide them by N-1 and consequently take the square root to acquire the sample standard deviations
sd_A <- sqrt(sum(sq_diff_A) / (length(A) - 1))
sd_B <- sqrt(sum(sq_diff_B) / (length(B) - 1))
# Combine all the pieces of the puzzle: r = cov / (sd_A * sd_B)
correlation <- cov_AB / (sd_A * sd_B)
correlation
## [1] 0.9607689
# Check the validity of your result with the cor() command
cor(A, B)
## [1] 0.9607689
# Read data from a URL into a dataframe called PE (physical endurance) - moved above to cache
# PE <- read.table("http://assets.datacamp.com/course/Conway/Lab_Data/Stats1.13.Lab.04.txt", header=TRUE)
# Summary statistics
psych::describe(PE)
## vars n mean sd median trimmed mad min max range skew
## pid 1 200 101.81 58.85 101.5 101.71 74.87 1 204 203 0.01
## age 2 200 49.41 10.48 48.0 49.46 10.38 20 82 62 0.06
## activeyears 3 200 10.68 4.69 11.0 10.57 4.45 0 26 26 0.30
## endurance 4 200 26.50 10.84 27.0 26.22 10.38 3 55 52 0.22
## kurtosis se
## pid -1.21 4.16
## age -0.14 0.74
## activeyears 0.46 0.33
## endurance -0.44 0.77
# Scatter plots
# Pairwise scatter plots of the three substantive variables (pid is an id)
plot(PE$age ~ PE$activeyears)
plot(PE$endurance ~ PE$activeyears)
plot(PE$endurance ~ PE$age)
# Correlation Analysis
# Correlation matrix excluding the id column, rounded to 2 decimal places
round(cor(PE[, !(names(PE) == "pid")]), 2)
## age activeyears endurance
## age 1.00 0.33 -0.08
## activeyears 0.33 1.00 0.33
## endurance -0.08 0.33 1.00
# Do some correlation tests. If the null hypothesis of no correlation can be rejected on a significance level of 5%, then the relationship between variables is significantly different from zero at the 95% confidence level
cor.test(PE$age, PE$activeyears)
##
## Pearson's product-moment correlation
##
## data: PE$age and PE$activeyears
## t = 4.9022, df = 198, p-value = 1.969e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1993491 0.4473145
## sample estimates:
## cor
## 0.3289909
cor.test(PE$endurance, PE$activeyears)
##
## Pearson's product-moment correlation
##
## data: PE$endurance and PE$activeyears
## t = 4.8613, df = 198, p-value = 2.37e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1967110 0.4451154
## sample estimates:
## cor
## 0.3265402
# age vs endurance: the only pair whose correlation is not significant
cor.test(PE$endurance, PE$age)
##
## Pearson's product-moment correlation
##
## data: PE$endurance and PE$age
## t = -1.1981, df = 198, p-value = 0.2323
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.22097811 0.05454491
## sample estimates:
## cor
## -0.08483813
# The impact dataset is already loaded in
rawImpactData <- " 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, control, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, concussed, 95, 90, 87, 84, 92, 89, 78, 97, 93, 90, 89, 97, 79, 86, 85, 85, 98, 95, 96, 92, 79, 85, 97, 89, 75, 75, 84, 93, 88, 97, 93, 96, 84, 89, 95, 95, 97, 95, 92, 95, 88, 82, 77, 72, 77, 79, 63, 82, 85, 66, 76, 79, 60, 59, 60, 76, 85, 83, 67, 84, 81, 85, 91, 74, 63, 68, 78, 74, 80, 73, 74, 70, 81, 72, 90, 74, 70, 63, 65, 69, 35.29, 31.47, 30.87, 41.87, 33.28, 40.73, 38.09, 31.65, 39.59, 30.53, 33.65, 37.51, 40.39, 32.88, 33.39, 35.13, 38.51, 29.64, 35.32, 27.36, 27.19, 32.66, 26.29, 28.92, 32.77, 32.92, 34.26, 36.08, 31.63, 28.89, 35.81, 33.61, 34.46, 39.18, 33.14, 33.03, 39.01, 35.06, 30.58, 38.45, 0.42, 0.63, 0.56, 0.66, 0.56, 0.81, 0.66, 0.79, 0.68, 0.60, 0.74, 0.51, 0.82, 0.59, 0.82, 0.63, 0.73, 0.57, 0.65, 1.00, 0.57, 0.71, 0.82, 0.61, 0.72, 0.50, 0.54, 0.65, 0.66, 0.71, 0.55, 0.79, 0.48, 0.55, 1.20, 0.73, 0.60, 0.84, 0.60, 0.42, 11, 7, 8, 7, 7, 6, 6, 10, 7, 10, 7, 7, 12, 2, 9, 10, 10, 8, 5, 11, 7, 9, 9, 9, 8, 9, 6, 10, 9, 7, 9, 7, 7, 10, 10, 11, 10, 5, 8, 11, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 97, 86, 90, 85, 87, 91, 90, 94, 91, 93, 92, 89, 84, 81, 85, 87, 96, 93, 95, 93, 63, 79, 91, 85, 74, 72, 80, 59, 75, 90, 66, 85, 72, 82, 80, 59, 74, 62, 67, 66, 86, 80, 79, 70, 77, 85, 60, 72, 83, 68, 72, 79, 67, 71, 61, 72, 78, 85, 67, 80, 75, 79, 80, 72, 56, 66, 74, 69, 79, 73, 69, 61, 79, 66, 80, 70, 62, 54, 57, 63, 35.61, 37.01, 
20.15, 33.26, 28.34, 33.47, 44.28, 36.14, 37.42, 25.19, 23.63, 26.32, 43.70, 32.40, 39.32, 35.62, 39.95, 35.62, 30.21, 30.37, 29.23, 44.45, 26.12, 27.98, 60.77, 31.91, 49.62, 35.68, 55.67, 25.70, 35.21, 33.01, 37.46, 53.20, 33.20, 34.59, 39.66, 35.09, 32.30, 44.49, 0.65, 0.49, 0.75, 0.19, 0.59, 0.48, 0.77, 0.90, 0.65, 0.59, 0.55, 0.56, 0.57, 0.69, 0.73, 0.48, 0.43, 0.37, 0.47, 0.50, 0.61, 0.65, 1.12, 0.65, 0.71, 0.79, 0.64, 0.70, 0.68, 0.73, 0.58, 0.97, 0.56, 0.51, 1.30, 0.70, 0.74, 1.24, 0.65, 0.98, 10, 7, 9, 8, 8, 5, 6, 10, 8, 11, 9, 9, 10, 3, 10, 12, 10, 9, 5, 11, 3, 6, 5, 5, 1, 9, 7, 11, 6, 3, 4, 3, 1, 7, 7, 4, 5, 2, 6, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 26, 34, 27, 22, 26, 35, 43, 31, 39, 25, 31, 38, 14, 16, 33, 13, 27, 15, 19, 39"
rawImpactNames <- c('subject', 'condition', 'vermem1', 'vismem1', 'vms1', 'rt1', 'ic1', 'sym1', 'vermem2', 'vismem2', 'vms2', 'rt2', 'ic2', 'sym2')
# Split the comma-separated blob above into one long character vector
splitImpactData <- stringr::str_split(rawImpactData, ",")
# The raw string lists all 40 values of each variable consecutively, so
# matrix()'s default column-by-column fill lines values up with the names
impactRawMatrix <- matrix(data=splitImpactData[[1]], ncol=length(rawImpactNames))
colnames(impactRawMatrix) <- rawImpactNames
rawImpactDF <- as.data.frame(impactRawMatrix, stringsAsFactors=FALSE)
# Convert every column except `condition` (column 2) from string to numeric
for (intCtr in c(1, 3:ncol(rawImpactDF))) { rawImpactDF[, intCtr] <- as.numeric(rawImpactDF[, intCtr]) }
# Strip the padding spaces from the group labels, then make a factor
rawImpactDF$condition <- factor(stringr::str_replace_all(rawImpactDF$condition, " ", ""))
impact <- rawImpactDF
# Summary statistics entire dataset
psych::describe(impact)
## vars n mean sd median trimmed mad min max range
## subject 1 40 20.50 11.69 20.50 20.50 14.83 1.00 40.00 39.00
## condition* 2 40 1.50 0.51 1.50 1.50 0.74 1.00 2.00 1.00
## vermem1 3 40 89.75 6.44 91.00 90.44 6.67 75.00 98.00 23.00
## vismem1 4 40 74.88 8.60 75.00 74.97 9.64 59.00 91.00 32.00
## vms1 5 40 34.03 3.90 33.50 34.02 3.62 26.29 41.87 15.58
## rt1 6 40 0.67 0.15 0.65 0.66 0.13 0.42 1.20 0.78
## ic1 7 40 8.28 2.05 8.50 8.38 2.22 2.00 12.00 10.00
## sym1 8 40 0.05 0.22 0.00 0.00 0.00 0.00 1.00 1.00
## vermem2 9 40 82.00 11.02 85.00 82.97 9.64 59.00 97.00 38.00
## vismem2 10 40 71.90 8.42 72.00 72.19 10.38 54.00 86.00 32.00
## vms2 11 40 35.83 8.66 35.15 34.98 6.89 20.15 60.77 40.62
## rt2 12 40 0.67 0.22 0.65 0.65 0.13 0.19 1.30 1.11
## ic2 13 40 6.75 2.98 7.00 6.81 2.97 1.00 12.00 11.00
## sym2 14 40 13.88 15.32 7.00 12.38 10.38 0.00 43.00 43.00
## skew kurtosis se
## subject 0.00 -1.29 1.85
## condition* 0.00 -2.05 0.08
## vermem1 -0.70 -0.51 1.02
## vismem1 -0.11 -0.96 1.36
## vms1 0.08 -0.75 0.62
## rt1 1.14 2.21 0.02
## ic1 -0.57 0.36 0.32
## sym1 3.98 14.16 0.03
## vermem2 -0.65 -0.81 1.74
## vismem2 -0.28 -0.87 1.33
## vms2 0.86 0.65 1.37
## rt2 0.93 1.29 0.03
## ic2 -0.16 -1.06 0.47
## sym2 0.44 -1.47 2.42
# Calculate correlation coefficient
# (visual vs verbal memory at time 2, pooled over both groups)
entirecorr <- round(cor(impact$vismem2, impact$vermem2), 2)
# Summary statistics subsets
# Same descriptives as above, split by condition (concussed vs control)
psych::describeBy(impact, impact$condition)
##
## Descriptive statistics by group
## group: concussed
## vars n mean sd median trimmed mad min max range
## subject 1 20 30.50 5.92 30.50 30.50 7.41 21.00 40.00 19.00
## condition* 2 20 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00
## vermem1 3 20 89.65 7.17 92.50 90.56 5.93 75.00 97.00 22.00
## vismem1 4 20 74.75 8.03 74.00 74.25 8.15 63.00 91.00 28.00
## vms1 5 20 33.20 3.62 33.09 33.27 3.32 26.29 39.18 12.89
## rt1 6 20 0.66 0.17 0.63 0.64 0.13 0.42 1.20 0.78
## ic1 7 20 8.55 1.64 9.00 8.62 1.48 5.00 11.00 6.00
## sym1 8 20 0.05 0.22 0.00 0.00 0.00 0.00 1.00 1.00
## vermem2 9 20 74.05 9.86 74.00 73.88 11.86 59.00 91.00 32.00
## vismem2 10 20 69.20 8.38 69.50 69.62 10.38 54.00 80.00 26.00
## vms2 11 20 38.27 10.01 35.15 37.32 7.73 25.70 60.77 35.07
## rt2 12 20 0.78 0.23 0.70 0.74 0.11 0.51 1.30 0.79
## ic2 13 20 5.00 2.53 5.00 4.88 2.97 1.00 11.00 10.00
## sym2 14 20 27.65 9.07 27.00 27.75 11.12 13.00 43.00 30.00
## skew kurtosis se
## subject 0.00 -1.38 1.32
## condition* NaN NaN 0.00
## vermem1 -0.79 -0.70 1.60
## vismem1 0.45 -0.72 1.80
## vms1 -0.13 -0.78 0.81
## rt1 1.38 2.41 0.04
## ic1 -0.39 -0.81 0.37
## sym1 3.82 13.29 0.05
## vermem2 0.07 -1.24 2.21
## vismem2 -0.27 -1.26 1.87
## vms2 0.77 -0.57 2.24
## rt2 1.09 -0.10 0.05
## ic2 0.39 -0.28 0.57
## sym2 -0.11 -1.25 2.03
## --------------------------------------------------------
## group: control
## vars n mean sd median trimmed mad min max range skew
## subject 1 20 10.50 5.92 10.50 10.50 7.41 1.00 20.00 19.00 0.00
## condition* 2 20 2.00 0.00 2.00 2.00 0.00 2.00 2.00 0.00 NaN
## vermem1 3 20 89.85 5.82 90.00 90.31 7.41 78.00 98.00 20.00 -0.41
## vismem1 4 20 75.00 9.34 77.00 75.50 9.64 59.00 88.00 29.00 -0.46
## vms1 5 20 34.86 4.09 34.39 34.85 4.92 27.36 41.87 14.51 0.09
## rt1 6 20 0.67 0.13 0.66 0.67 0.13 0.42 1.00 0.58 0.47
## ic1 7 20 8.00 2.41 7.50 8.12 2.22 2.00 12.00 10.00 -0.41
## sym1 8 20 0.05 0.22 0.00 0.00 0.00 0.00 1.00 1.00 3.82
## vermem2 9 20 89.95 4.36 90.50 90.06 5.19 81.00 97.00 16.00 -0.25
## vismem2 10 20 74.60 7.76 74.50 75.00 8.15 60.00 86.00 26.00 -0.23
## vms2 11 20 33.40 6.44 34.54 33.52 6.30 20.15 44.28 24.13 -0.25
## rt2 12 20 0.57 0.16 0.56 0.57 0.13 0.19 0.90 0.71 -0.16
## ic2 13 20 8.50 2.31 9.00 8.69 1.48 3.00 12.00 9.00 -0.73
## sym2 14 20 0.10 0.31 0.00 0.00 0.00 0.00 1.00 1.00 2.47
## kurtosis se
## subject -1.38 1.32
## condition* NaN 0.00
## vermem1 -0.87 1.30
## vismem1 -1.27 2.09
## vms1 -1.19 0.91
## rt1 -0.02 0.03
## ic1 -0.17 0.54
## sym1 13.29 0.05
## vermem2 -1.02 0.97
## vismem2 -1.11 1.73
## vms2 -0.77 1.44
## rt2 0.06 0.04
## ic2 -0.32 0.52
## sym2 4.32 0.07
# Create 2 subsets: control and concussed
control <- subset(impact, condition == "control")
concussed <- subset(impact, condition == "concussed")
# Calculate correlation coefficients for each subset
controlcorr <- round(cor(control$vismem2, control$vermem2), 2)
concussedcorr <- round(cor(concussed$vismem2, concussed$vermem2), 2)
# Display all values at the same time
# Note: the pooled correlation (0.45) exceeds both within-group values,
# presumably because between-group mean differences add to the association
correlations <- cbind(entirecorr, controlcorr, concussedcorr)
correlations
## entirecorr controlcorr concussedcorr
## [1,] 0.45 0.37 0.35
Chapter 2 - Introduction to Linear Regression
Introduction to regression:
Regression equations and the R-squared value:
Multiple linear regression:
Example code includes:
# Look at the dataset. Note that the variables we are interested in are on the 9th to 14th columns
head(impact)
## subject condition vermem1 vismem1 vms1 rt1 ic1 sym1 vermem2 vismem2
## 1 1 control 95 88 35.29 0.42 11 0 97 86
## 2 2 control 90 82 31.47 0.63 7 0 86 80
## 3 3 control 87 77 30.87 0.56 8 0 90 79
## 4 4 control 84 72 41.87 0.66 7 0 85 70
## 5 5 control 92 77 33.28 0.56 7 1 87 77
## 6 6 control 89 79 40.73 0.81 6 0 91 85
## vms2 rt2 ic2 sym2
## 1 35.61 0.65 10 0
## 2 37.01 0.49 7 0
## 3 20.15 0.75 9 0
## 4 33.26 0.19 8 0
## 5 28.34 0.59 8 1
## 6 33.47 0.48 5 0
# Create a correlation matrix for the dataset (time-2 variables only)
correlations <- cor(impact[, 9:14])
# Create the scatterplot matrix for the dataset
corrplot::corrplot(correlations)
# Calculate the required means, standard deviations and correlation coefficient
mean_sym2 <- mean(impact$sym2)
mean_ic2 <- mean(impact$ic2)
sd_sym2 <- sd(impact$sym2)
sd_ic2 <- sd(impact$ic2)
r <- cor(impact$ic2,impact$sym2)
# Calculate the slope: b1 = r * (sd of response) / (sd of predictor)
B_1 <- r * ( sd_sym2 )/( sd_ic2 )
# Calculate the intercept (the line passes through the point of means)
B_0 <- mean_sym2 - B_1 * mean_ic2
# Plot of ic2 against sym2
plot(x=impact$ic2, y=impact$sym2, main = "Scatterplot", ylab = "Symptoms", xlab = "Impulse Control")
# Add the regression line
abline(B_0, B_1, col = "red")
# Construct the regression model (should reproduce the hand-computed B_0/B_1)
# NOTE(review): lm(sym2 ~ ic2, data = impact) is the conventional form; the
# impact$ style works but makes predict() on new data awkward.
model_1 <- lm(impact$sym2 ~ impact$ic2)
# Look at the results of the regression by using the summary function
summary(model_1)
##
## Call:
## lm(formula = impact$sym2 ~ impact$ic2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.441 -8.983 -5.309 9.127 29.696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.2945 5.5090 5.318 4.9e-06 ***
## impact$ic2 -2.2844 0.7483 -3.053 0.00413 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.91 on 38 degrees of freedom
## Multiple R-squared: 0.1969, Adjusted R-squared: 0.1758
## F-statistic: 9.319 on 1 and 38 DF, p-value: 0.004125
# Create a scatter plot of Impulse Control against Symptom Score
plot(impact$sym2 ~ impact$ic2, main = "Scatterplot", ylab = "Symptoms", xlab = "Impulse Control")
# Add a regression line
abline(model_1, col = "red")
# Multiple Regression
# Add verbal memory (vermem2) as a second predictor of symptom score
model_2 <- lm(impact$sym2 ~ impact$ic2 + impact$vermem2)
# Examine the results of the regression
# Note: ic2 is no longer significant once vermem2 is in the model
summary(model_2)
##
## Call:
## lm(formula = impact$sym2 ~ impact$ic2 + impact$vermem2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.274 -8.031 -2.703 6.245 27.962
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.7639 14.7765 5.398 4.1e-06 ***
## impact$ic2 -1.0711 0.7335 -1.460 0.152690
## impact$vermem2 -0.7154 0.1981 -3.611 0.000898 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.12 on 37 degrees of freedom
## Multiple R-squared: 0.4062, Adjusted R-squared: 0.3742
## F-statistic: 12.66 on 2 and 37 DF, p-value: 6.482e-05
# Extract the predicted values
predicted <- fitted(model_2)
# Plotting predicted scores against observed scores
plot(predicted ~ impact$sym2, main = "Scatterplot", xlab = "Observed Scores", ylab = "Predicted Scores")
abline(lm(predicted ~ impact$sym2), col = "green")
Chapter 3 - Linear Regression Models (cont)
Estimation of coefficients - the key concept is to minimize the sum of squared residuals:
Estimation of standardized and unstandardized regression coefficients:
Assumptions of linear regression:
Anscombe’s quartet:
Example code includes:
# Compare two 2-predictor models by their sum of squared residuals (SSR):
# the smaller SSR identifies the better-fitting pair of regressors.
# Create a linear regression with `ic2` and `vismem2` as regressors
model_1 <- lm(impact$sym2 ~ impact$ic2 + impact$vismem2)
# Extract the predicted values
predicted_1 <- fitted(model_1)
# Calculate the squared deviation of the predicted values from the observed values
deviation_1 <- (impact$sym2 - predicted_1) ** 2
# Sum the squared deviations
SSR_1 <- sum(deviation_1)
SSR_1
## [1] 7236.338
# Create a linear regression with `ic2` and `vermem2` as regressors
model_2 <- lm(impact$sym2 ~ impact$ic2 + impact$vermem2)
# Extract the predicted values
predicted_2 <- fitted(model_2)
# Calculate the squared deviation of the predicted values from the observed values
deviation_2 <- (impact$sym2 - predicted_2) ** 2
# Sum the squared deviations
# SSR_2 < SSR_1, so vermem2 is the more useful second regressor here
SSR_2 <- sum(deviation_2)
SSR_2
## [1] 5435.454
# Standardized simple regression: z-scoring both variables makes the slope
# equal the correlation coefficient.
# Create a standardized simple linear regression
model_1_z <- lm(scale(impact$sym2) ~ scale(impact$ic2))
# Look at the output of this regression model
summary(model_1_z)
##
## Call:
## lm(formula = scale(impact$sym2) ~ scale(impact$ic2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4648 -0.5863 -0.3465 0.5958 1.9383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.487e-16 1.435e-01 0.000 1.00000
## scale(impact$ic2) -4.438e-01 1.454e-01 -3.053 0.00413 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9078 on 38 degrees of freedom
## Multiple R-squared: 0.1969, Adjusted R-squared: 0.1758
## F-statistic: 9.319 on 1 and 38 DF, p-value: 0.004125
# Extract the R-Squared value for this regression
# Fixed: the summary component is named `r.squared`; `$r.square` only worked
# via partial matching and was inconsistent with the model_2_z code below
r_square_1 <- summary(model_1_z)$r.squared
# Calculate the correlation coefficient (for simple regression, |r| = sqrt(R^2))
corr_coef_1 <- sqrt(r_square_1)
# Create a standardized multiple linear regression (coefficients are betas)
model_2_z <- lm(scale(impact$sym2) ~ scale(impact$ic2) + scale(impact$vismem2))
# Look at the output of this regression model
summary(model_2_z)
##
## Call:
## lm(formula = scale(impact$sym2) ~ scale(impact$ic2) + scale(impact$vismem2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4349 -0.5949 -0.3174 0.5331 1.9646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.450e-16 1.443e-01 0.000 1.0000
## scale(impact$ic2) -4.101e-01 1.526e-01 -2.688 0.0107 *
## scale(impact$vismem2) -1.171e-01 1.526e-01 -0.767 0.4479
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9128 on 37 degrees of freedom
## Multiple R-squared: 0.2095, Adjusted R-squared: 0.1668
## F-statistic: 4.904 on 2 and 37 DF, p-value: 0.01291
# Extract the R-Squared value for this regression
r_square_2 <- summary(model_2_z)$r.squared
# Calculate the correlation coefficient
# (sqrt(R^2) here is the multiple correlation, not a pairwise r)
corr_coef_2 <- sqrt(r_square_2)
# Residual diagnostics for model_2 (fit earlier in this chapter)
# Extract the residuals from the model
residual <- resid(model_2)
# Draw a histogram of the residuals (normality check)
hist(residual)
# Extract the predicted symptom scores from the model
predicted <- fitted(model_2)
# Plot the residuals against the predicted symptom scores
# (the fitted red line has slope ~0 by construction: OLS residuals are
# uncorrelated with the fitted values)
plot(residual ~ predicted, main = "Scatterplot", xlab="Model 2 Predicted Scores", ylab="Model 2 Residuals" )
abline(lm(residual ~ predicted), col="red")
Chapter 1 - Inferential Ideas